sdsykes-ferret 0.11.6.19

Files changed (195)
  1. data/CHANGELOG +24 -0
  2. data/MIT-LICENSE +20 -0
  3. data/README +102 -0
  4. data/Rakefile +338 -0
  5. data/TODO +17 -0
  6. data/TUTORIAL +231 -0
  7. data/bin/ferret-browser +79 -0
  8. data/ext/analysis.c +1555 -0
  9. data/ext/analysis.h +219 -0
  10. data/ext/api.c +69 -0
  11. data/ext/api.h +27 -0
  12. data/ext/array.c +123 -0
  13. data/ext/array.h +53 -0
  14. data/ext/bitvector.c +540 -0
  15. data/ext/bitvector.h +272 -0
  16. data/ext/compound_io.c +383 -0
  17. data/ext/config.h +42 -0
  18. data/ext/document.c +156 -0
  19. data/ext/document.h +53 -0
  20. data/ext/except.c +120 -0
  21. data/ext/except.h +168 -0
  22. data/ext/extconf.rb +14 -0
  23. data/ext/ferret.c +402 -0
  24. data/ext/ferret.h +91 -0
  25. data/ext/filter.c +156 -0
  26. data/ext/fs_store.c +483 -0
  27. data/ext/global.c +418 -0
  28. data/ext/global.h +117 -0
  29. data/ext/hash.c +567 -0
  30. data/ext/hash.h +473 -0
  31. data/ext/hashset.c +170 -0
  32. data/ext/hashset.h +187 -0
  33. data/ext/header.h +58 -0
  34. data/ext/helper.c +62 -0
  35. data/ext/helper.h +13 -0
  36. data/ext/inc/lang.h +48 -0
  37. data/ext/inc/threading.h +31 -0
  38. data/ext/index.c +6425 -0
  39. data/ext/index.h +961 -0
  40. data/ext/lang.h +66 -0
  41. data/ext/libstemmer.c +92 -0
  42. data/ext/libstemmer.h +79 -0
  43. data/ext/mempool.c +87 -0
  44. data/ext/mempool.h +35 -0
  45. data/ext/modules.h +162 -0
  46. data/ext/multimapper.c +310 -0
  47. data/ext/multimapper.h +51 -0
  48. data/ext/posh.c +1006 -0
  49. data/ext/posh.h +1007 -0
  50. data/ext/priorityqueue.c +151 -0
  51. data/ext/priorityqueue.h +143 -0
  52. data/ext/q_boolean.c +1608 -0
  53. data/ext/q_const_score.c +161 -0
  54. data/ext/q_filtered_query.c +209 -0
  55. data/ext/q_fuzzy.c +268 -0
  56. data/ext/q_match_all.c +148 -0
  57. data/ext/q_multi_term.c +677 -0
  58. data/ext/q_parser.c +2825 -0
  59. data/ext/q_phrase.c +1126 -0
  60. data/ext/q_prefix.c +100 -0
  61. data/ext/q_range.c +350 -0
  62. data/ext/q_span.c +2402 -0
  63. data/ext/q_term.c +337 -0
  64. data/ext/q_wildcard.c +171 -0
  65. data/ext/r_analysis.c +2575 -0
  66. data/ext/r_index.c +3472 -0
  67. data/ext/r_qparser.c +585 -0
  68. data/ext/r_search.c +4105 -0
  69. data/ext/r_store.c +513 -0
  70. data/ext/r_utils.c +963 -0
  71. data/ext/ram_store.c +471 -0
  72. data/ext/search.c +1741 -0
  73. data/ext/search.h +885 -0
  74. data/ext/similarity.c +150 -0
  75. data/ext/similarity.h +82 -0
  76. data/ext/sort.c +983 -0
  77. data/ext/stem_ISO_8859_1_danish.c +338 -0
  78. data/ext/stem_ISO_8859_1_danish.h +16 -0
  79. data/ext/stem_ISO_8859_1_dutch.c +635 -0
  80. data/ext/stem_ISO_8859_1_dutch.h +16 -0
  81. data/ext/stem_ISO_8859_1_english.c +1156 -0
  82. data/ext/stem_ISO_8859_1_english.h +16 -0
  83. data/ext/stem_ISO_8859_1_finnish.c +792 -0
  84. data/ext/stem_ISO_8859_1_finnish.h +16 -0
  85. data/ext/stem_ISO_8859_1_french.c +1276 -0
  86. data/ext/stem_ISO_8859_1_french.h +16 -0
  87. data/ext/stem_ISO_8859_1_german.c +512 -0
  88. data/ext/stem_ISO_8859_1_german.h +16 -0
  89. data/ext/stem_ISO_8859_1_italian.c +1091 -0
  90. data/ext/stem_ISO_8859_1_italian.h +16 -0
  91. data/ext/stem_ISO_8859_1_norwegian.c +296 -0
  92. data/ext/stem_ISO_8859_1_norwegian.h +16 -0
  93. data/ext/stem_ISO_8859_1_porter.c +776 -0
  94. data/ext/stem_ISO_8859_1_porter.h +16 -0
  95. data/ext/stem_ISO_8859_1_portuguese.c +1035 -0
  96. data/ext/stem_ISO_8859_1_portuguese.h +16 -0
  97. data/ext/stem_ISO_8859_1_spanish.c +1119 -0
  98. data/ext/stem_ISO_8859_1_spanish.h +16 -0
  99. data/ext/stem_ISO_8859_1_swedish.c +307 -0
  100. data/ext/stem_ISO_8859_1_swedish.h +16 -0
  101. data/ext/stem_KOI8_R_russian.c +701 -0
  102. data/ext/stem_KOI8_R_russian.h +16 -0
  103. data/ext/stem_UTF_8_danish.c +344 -0
  104. data/ext/stem_UTF_8_danish.h +16 -0
  105. data/ext/stem_UTF_8_dutch.c +653 -0
  106. data/ext/stem_UTF_8_dutch.h +16 -0
  107. data/ext/stem_UTF_8_english.c +1176 -0
  108. data/ext/stem_UTF_8_english.h +16 -0
  109. data/ext/stem_UTF_8_finnish.c +808 -0
  110. data/ext/stem_UTF_8_finnish.h +16 -0
  111. data/ext/stem_UTF_8_french.c +1296 -0
  112. data/ext/stem_UTF_8_french.h +16 -0
  113. data/ext/stem_UTF_8_german.c +526 -0
  114. data/ext/stem_UTF_8_german.h +16 -0
  115. data/ext/stem_UTF_8_italian.c +1113 -0
  116. data/ext/stem_UTF_8_italian.h +16 -0
  117. data/ext/stem_UTF_8_norwegian.c +302 -0
  118. data/ext/stem_UTF_8_norwegian.h +16 -0
  119. data/ext/stem_UTF_8_porter.c +794 -0
  120. data/ext/stem_UTF_8_porter.h +16 -0
  121. data/ext/stem_UTF_8_portuguese.c +1055 -0
  122. data/ext/stem_UTF_8_portuguese.h +16 -0
  123. data/ext/stem_UTF_8_russian.c +709 -0
  124. data/ext/stem_UTF_8_russian.h +16 -0
  125. data/ext/stem_UTF_8_spanish.c +1137 -0
  126. data/ext/stem_UTF_8_spanish.h +16 -0
  127. data/ext/stem_UTF_8_swedish.c +313 -0
  128. data/ext/stem_UTF_8_swedish.h +16 -0
  129. data/ext/stopwords.c +401 -0
  130. data/ext/store.c +692 -0
  131. data/ext/store.h +777 -0
  132. data/ext/term_vectors.c +352 -0
  133. data/ext/threading.h +31 -0
  134. data/ext/utilities.c +446 -0
  135. data/ext/win32.h +54 -0
  136. data/lib/ferret.rb +29 -0
  137. data/lib/ferret/browser.rb +246 -0
  138. data/lib/ferret/browser/s/global.js +192 -0
  139. data/lib/ferret/browser/s/style.css +148 -0
  140. data/lib/ferret/browser/views/document/list.rhtml +49 -0
  141. data/lib/ferret/browser/views/document/show.rhtml +27 -0
  142. data/lib/ferret/browser/views/error/index.rhtml +7 -0
  143. data/lib/ferret/browser/views/help/index.rhtml +8 -0
  144. data/lib/ferret/browser/views/home/index.rhtml +29 -0
  145. data/lib/ferret/browser/views/layout.rhtml +22 -0
  146. data/lib/ferret/browser/views/term-vector/index.rhtml +4 -0
  147. data/lib/ferret/browser/views/term/index.rhtml +199 -0
  148. data/lib/ferret/browser/views/term/termdocs.rhtml +1 -0
  149. data/lib/ferret/browser/webrick.rb +14 -0
  150. data/lib/ferret/document.rb +130 -0
  151. data/lib/ferret/field_infos.rb +44 -0
  152. data/lib/ferret/index.rb +786 -0
  153. data/lib/ferret/number_tools.rb +157 -0
  154. data/lib/ferret_version.rb +3 -0
  155. data/setup.rb +1555 -0
  156. data/test/test_all.rb +5 -0
  157. data/test/test_helper.rb +24 -0
  158. data/test/threading/number_to_spoken.rb +132 -0
  159. data/test/threading/thread_safety_index_test.rb +79 -0
  160. data/test/threading/thread_safety_read_write_test.rb +76 -0
  161. data/test/threading/thread_safety_test.rb +133 -0
  162. data/test/unit/analysis/tc_analyzer.rb +548 -0
  163. data/test/unit/analysis/tc_token_stream.rb +646 -0
  164. data/test/unit/index/tc_index.rb +762 -0
  165. data/test/unit/index/tc_index_reader.rb +699 -0
  166. data/test/unit/index/tc_index_writer.rb +437 -0
  167. data/test/unit/index/th_doc.rb +315 -0
  168. data/test/unit/largefile/tc_largefile.rb +46 -0
  169. data/test/unit/query_parser/tc_query_parser.rb +238 -0
  170. data/test/unit/search/tc_filter.rb +135 -0
  171. data/test/unit/search/tc_fuzzy_query.rb +147 -0
  172. data/test/unit/search/tc_index_searcher.rb +61 -0
  173. data/test/unit/search/tc_multi_searcher.rb +128 -0
  174. data/test/unit/search/tc_multiple_search_requests.rb +58 -0
  175. data/test/unit/search/tc_search_and_sort.rb +179 -0
  176. data/test/unit/search/tc_sort.rb +49 -0
  177. data/test/unit/search/tc_sort_field.rb +27 -0
  178. data/test/unit/search/tc_spans.rb +190 -0
  179. data/test/unit/search/tm_searcher.rb +384 -0
  180. data/test/unit/store/tc_fs_store.rb +77 -0
  181. data/test/unit/store/tc_ram_store.rb +35 -0
  182. data/test/unit/store/tm_store.rb +34 -0
  183. data/test/unit/store/tm_store_lock.rb +68 -0
  184. data/test/unit/tc_document.rb +81 -0
  185. data/test/unit/ts_analysis.rb +2 -0
  186. data/test/unit/ts_index.rb +2 -0
  187. data/test/unit/ts_largefile.rb +4 -0
  188. data/test/unit/ts_query_parser.rb +2 -0
  189. data/test/unit/ts_search.rb +2 -0
  190. data/test/unit/ts_store.rb +2 -0
  191. data/test/unit/ts_utils.rb +2 -0
  192. data/test/unit/utils/tc_bit_vector.rb +295 -0
  193. data/test/unit/utils/tc_number_tools.rb +117 -0
  194. data/test/unit/utils/tc_priority_queue.rb +106 -0
  195. metadata +285 -0
data/ext/r_analysis.c
@@ -0,0 +1,2575 @@
1
+ #include "lang.h"
2
+ #ifdef FRT_RUBY_VERSION_1_9
3
+ # include <ruby/re.h>
4
+ #else
5
+ # include <regex.h>
6
+ #endif
7
+ #include <locale.h>
8
+ #include <ruby/st.h>
9
+ #include "ferret.h"
10
+ #include "analysis.h"
11
+
12
+ static char *frt_locale = NULL;
13
+
14
+ static VALUE mAnalysis;
15
+
16
+ static VALUE cToken;
17
+ static VALUE cAsciiLetterTokenizer;
18
+ static VALUE cLetterTokenizer;
19
+ static VALUE cAsciiWhiteSpaceTokenizer;
20
+ static VALUE cWhiteSpaceTokenizer;
21
+ static VALUE cAsciiStandardTokenizer;
22
+ static VALUE cStandardTokenizer;
23
+ static VALUE cRegExpTokenizer;
24
+
25
+ static VALUE cAsciiLowerCaseFilter;
26
+ static VALUE cLowerCaseFilter;
27
+ static VALUE cStopFilter;
28
+ static VALUE cMappingFilter;
29
+ static VALUE cHyphenFilter;
30
+ static VALUE cStemFilter;
31
+
32
+ static VALUE cAnalyzer;
33
+ static VALUE cAsciiLetterAnalyzer;
34
+ static VALUE cLetterAnalyzer;
35
+ static VALUE cAsciiWhiteSpaceAnalyzer;
36
+ static VALUE cWhiteSpaceAnalyzer;
37
+ static VALUE cAsciiStandardAnalyzer;
38
+ static VALUE cStandardAnalyzer;
39
+ static VALUE cPerFieldAnalyzer;
40
+ static VALUE cRegExpAnalyzer;
41
+
42
+ static VALUE cTokenStream;
43
+
44
+ /* TokenStream Methods */
45
+ static ID id_next;
46
+ static ID id_reset;
47
+ static ID id_clone;
48
+ static ID id_text;
49
+
50
+ /* Analyzer Methods */
51
+ static ID id_token_stream;
52
+
53
+ static VALUE object_space;
54
+
55
+ #ifndef FRT_RUBY_VERSION_1_9
56
+ extern int ruby_re_search(struct re_pattern_buffer *, const char *, int, int,
57
+ int, struct re_registers *);
58
+ #endif
59
+
60
+ int
61
+ frt_rb_hash_size(VALUE hash)
62
+ {
63
+ return RHASH(hash)->ntbl->num_entries;
64
+ }
65
+
66
+ /****************************************************************************
67
+ *
68
+ * Utility Methods
69
+ *
70
+ ****************************************************************************/
71
+
72
+ static char **
73
+ get_stopwords(VALUE rstop_words)
74
+ {
75
+ char **stop_words;
76
+ int i, len;
77
+ VALUE rstr;
78
+ Check_Type(rstop_words, T_ARRAY);
79
+ len = RARRAY_LEN(rstop_words);
80
+ stop_words = ALLOC_N(char *, RARRAY_LEN(rstop_words) + 1);
81
+ stop_words[len] = NULL;
82
+ for (i = 0; i < len; i++) {
83
+ rstr = rb_obj_as_string(RARRAY_PTR(rstop_words)[i]);
84
+ stop_words[i] = rs2s(rstr);
85
+ }
86
+ return stop_words;
87
+ }
88
+
89
+ /****************************************************************************
90
+ *
91
+ * token methods
92
+ *
93
+ ****************************************************************************/
94
+
95
+ typedef struct RToken {
96
+ VALUE text;
97
+ int start;
98
+ int end;
99
+ int pos_inc;
100
+ } RToken;
101
+
102
+ static void
103
+ frt_token_free(void *p)
104
+ {
105
+ free(p);
106
+ }
107
+
108
+ static void
109
+ frt_token_mark(void *p)
110
+ {
111
+ RToken *token = (RToken *)p;
112
+ rb_gc_mark(token->text);
113
+ }
114
+
115
+ static VALUE
116
+ frt_token_alloc(VALUE klass)
117
+ {
118
+ return Data_Wrap_Struct(klass, &frt_token_mark, &frt_token_free,
119
+ ALLOC(RToken));
120
+ }
121
+
122
+ static VALUE
123
+ get_token(Token *tk)
124
+ {
125
+ RToken *token = ALLOC(RToken);
126
+
127
+ token->text = rb_str_new2(tk->text);
128
+ token->start = tk->start;
129
+ token->end = tk->end;
130
+ token->pos_inc = tk->pos_inc;
131
+ return Data_Wrap_Struct(cToken, &frt_token_mark, &frt_token_free, token);
132
+ }
133
+
134
+ Token *
135
+ frt_set_token(Token *tk, VALUE rt)
136
+ {
137
+ RToken *rtk;
138
+
139
+ if (rt == Qnil) return NULL;
140
+
141
+ Data_Get_Struct(rt, RToken, rtk);
142
+ tk_set(tk, rs2s(rtk->text), RSTRING_LEN(rtk->text),
143
+ rtk->start, rtk->end, rtk->pos_inc);
144
+ return tk;
145
+ }
146
+
147
+ #define GET_TK(tk, self) Data_Get_Struct(self, RToken, tk)
148
+
149
+ /*
150
+ * call-seq:
151
+ * Token.new(text, start, end, pos_inc = 1) -> new Token
152
+ *
153
+ * Creates a new token setting the text, start and end offsets of the token
154
+ * and the position increment for the token.
155
+ *
156
+ * The position increment is usually set to 1 but you can set it to other
157
+ * values as needed. For example, if you have a stop word filter you will be
158
+ * skipping tokens. Let's say you have the stop words "the" and "and" and you
159
+ * parse the title "The Old Man and the Sea". The terms "Old", "Man" and
160
+ * "Sea" will have the position increments 2, 1 and 3 respectively.
161
+ *
162
+ * Another reason you might want to vary the position increment is if you are
163
+ * adding synonyms to the index. For example let's say you have the synonym
164
+ * group "quick", "fast" and "speedy". When tokenizing the phrase "Next day
165
+ * speedy delivery", you'll add "speedy" first with a position increment of 1
166
+ * and then "fast" and "quick" with position increments of 0 since they are
167
+ * represented in the same position.
168
+ *
169
+ * The offset values +start+ and +end+ should be byte offsets, not
170
+ * character offsets. This makes it easy to use those offsets to quickly
171
+ * access the token in the input string and also to insert highlighting tags
172
+ * when necessary.
173
+ *
174
+ * text:: the main text for the token.
175
+ * start:: the start offset of the token in bytes.
176
+ * end:: the end offset of the token in bytes.
177
+ * pos_inc:: the position increment of a token. See above.
178
+ * return:: a newly created and assigned Token object
179
+ */
180
+ static VALUE
181
+ frt_token_init(int argc, VALUE *argv, VALUE self)
182
+ {
183
+ RToken *token;
184
+ VALUE rtext, rstart, rend, rpos_inc, rtype;
185
+ GET_TK(token, self);
186
+ token->pos_inc = 1;
187
+ switch (rb_scan_args(argc, argv, "32", &rtext, &rstart,
188
+ &rend, &rpos_inc, &rtype)) {
189
+ case 5: /* type gets ignored at this stage */
190
+ case 4: token->pos_inc = FIX2INT(rpos_inc);
191
+ }
192
+ token->text = rb_obj_as_string(rtext);
193
+ token->start = FIX2INT(rstart);
194
+ token->end = FIX2INT(rend);
195
+ return self;
196
+ }
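+ /* Illustrative example (not part of the original source): the position
+ * increments described above, expressed through the Ruby API. The offsets
+ * assume the input "The Old Man and the Sea" with the stop words "the"
+ * and "and" skipped by a stop filter.
+ *
+ *   require 'ferret'
+ *   include Ferret::Analysis
+ *
+ *   old = Token.new("Old", 4, 7, 2)    # skipped "The", so pos_inc = 2
+ *   man = Token.new("Man", 8, 11)      # pos_inc defaults to 1
+ *   sea = Token.new("Sea", 20, 23, 3)  # skipped "and the", so pos_inc = 3
+ */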
197
+
198
+ /*
199
+ * call-seq:
200
+ * token.cmp(other_token) -> bool
201
+ *
202
+ * Used to compare two tokens. Token is extended by Comparable so you can
203
+ * also use +<+, +>+, +<=+, +>=+ etc. to compare tokens.
204
+ *
205
+ * Tokens are sorted by the position in the text at which they occur, i.e.
206
+ * the start offset. If two tokens have the same start offset (see
207
+ * pos_inc=), they are sorted by the end offset and then
208
+ * lexically by the token text.
209
+ */
210
+ static VALUE
211
+ frt_token_cmp(VALUE self, VALUE rother)
212
+ {
213
+ RToken *token, *other;
214
+ int cmp;
215
+ GET_TK(token, self);
216
+ GET_TK(other, rother);
217
+ if (token->start > other->start) {
218
+ cmp = 1;
219
+ } else if (token->start < other->start) {
220
+ cmp = -1;
221
+ } else {
222
+ if (token->end > other->end) {
223
+ cmp = 1;
224
+ } else if (token->end < other->end) {
225
+ cmp = -1;
226
+ } else {
227
+ cmp = strcmp(rs2s(token->text), rs2s(other->text));
228
+ }
229
+ }
230
+ return INT2FIX(cmp);
231
+ }
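+ /* Illustrative example (not part of the original source): because Token
+ * is extended by Comparable, the ordering described above can be used
+ * directly from Ruby.
+ *
+ *   t1 = Ferret::Analysis::Token.new("one", 0, 3)
+ *   t2 = Ferret::Analysis::Token.new("two", 4, 7)
+ *   t1 < t2        #=> true, t1 starts earlier in the text
+ *   t1 <=> t2      #=> -1
+ */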
232
+
233
+ /*
234
+ * call-seq:
235
+ * token.text -> text
236
+ *
237
+ * Returns the text that this token represents
238
+ */
239
+ static VALUE
240
+ frt_token_get_text(VALUE self)
241
+ {
242
+ RToken *token;
243
+ GET_TK(token, self);
244
+ return token->text;
245
+ }
246
+
247
+ /*
248
+ * call-seq:
249
+ * token.text = text -> text
250
+ *
251
+ * Set the text for this token.
252
+ */
253
+ static VALUE
254
+ frt_token_set_text(VALUE self, VALUE rtext)
255
+ {
256
+ RToken *token;
257
+ GET_TK(token, self);
258
+ token->text = rtext;
259
+ return rtext;
260
+ }
261
+
262
+ /*
263
+ * call-seq:
264
+ * token.start -> integer
265
+ *
266
+ * Start byte-position of this token
267
+ */
268
+ static VALUE
269
+ frt_token_get_start_offset(VALUE self)
270
+ {
271
+ RToken *token;
272
+ GET_TK(token, self);
273
+ return INT2FIX(token->start);
274
+ }
275
+
276
+ /*
277
+ * call-seq:
278
+ * token.end -> integer
279
+ *
280
+ * End byte-position of this token
281
+ */
282
+ static VALUE
283
+ frt_token_get_end_offset(VALUE self)
284
+ {
285
+ RToken *token;
286
+ GET_TK(token, self);
287
+ return INT2FIX(token->end);
288
+ }
289
+
290
+ /*
291
+ * call-seq:
292
+ * token.pos_inc -> integer
293
+ *
294
+ * Position Increment for this token
295
+ */
296
+ static VALUE
297
+ frt_token_get_pos_inc(VALUE self)
298
+ {
299
+ RToken *token;
300
+ GET_TK(token, self);
301
+ return INT2FIX(token->pos_inc);
302
+ }
303
+
304
+ /*
305
+ * call-seq:
306
+ * token.start = start -> integer
307
+ *
308
+ * Set start byte-position of this token
309
+ */
310
+ static VALUE
311
+ frt_token_set_start_offset(VALUE self, VALUE rstart)
312
+ {
313
+ RToken *token;
314
+ GET_TK(token, self);
315
+ token->start = FIX2INT(rstart);
316
+ return rstart;
317
+ }
318
+
319
+ /*
320
+ * call-seq:
321
+ * token.end = end -> integer
322
+ *
323
+ * Set end byte-position of this token
324
+ */
325
+ static VALUE
326
+ frt_token_set_end_offset(VALUE self, VALUE rend)
327
+ {
328
+ RToken *token;
329
+ GET_TK(token, self);
330
+ token->end = FIX2INT(rend);
331
+ return rend;
332
+ }
333
+
334
+ /*
335
+ * call-seq:
336
+ * token.pos_inc = pos_inc -> integer
337
+ *
338
+ * Set the position increment. This determines the position of this token
339
+ * relative to the previous Token in a TokenStream, used in phrase
340
+ * searching.
341
+ *
342
+ * The default value is 1.
343
+ *
344
+ * Some common uses for this are:
345
+ *
346
+ * * Set it to zero to put multiple terms in the same position. This is
347
+ * useful if, e.g., a word has multiple stems. Searches for phrases
348
+ * including either stem will match. In this case, all but the first
349
+ * stem's increment should be set to zero: the increment of the first
350
+ * instance should be one. Repeating a token with an increment of zero
351
+ * can also be used to boost the scores of matches on that token.
352
+ *
353
+ * * Set it to values greater than one to inhibit exact phrase matches.
354
+ * If, for example, one does not want phrases to match across removed
355
+ * stop words, then one could build a stop word filter that removes stop
356
+ * words and also sets the increment to the number of stop words removed
357
+ * before each non-stop word. Then exact phrase queries will only match
358
+ * when the terms occur with no intervening stop words.
359
+ *
360
+ */
361
+ static VALUE
362
+ frt_token_set_pos_inc(VALUE self, VALUE rpos_inc)
363
+ {
364
+ RToken *token;
365
+ GET_TK(token, self);
366
+ token->pos_inc = FIX2INT(rpos_inc);
367
+ return rpos_inc;
368
+ }
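+ /* Illustrative example (not part of the original source): emitting a
+ * synonym at the same position as the original term by setting the
+ * position increment to 0, as described above.
+ *
+ *   speedy = Ferret::Analysis::Token.new("speedy", 9, 15)  # pos_inc defaults to 1
+ *   fast   = Ferret::Analysis::Token.new("fast", 9, 15)
+ *   fast.pos_inc = 0    # occupies the same position as "speedy"
+ */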
369
+
370
+ /*
371
+ * call-seq:
372
+ * token.to_s -> token_str
373
+ *
374
+ * Return a string representation of the token
375
+ */
376
+ static VALUE
377
+ frt_token_to_s(VALUE self)
378
+ {
379
+ RToken *token;
380
+ char *buf;
381
+ GET_TK(token, self);
382
+ buf = alloca(RSTRING_LEN(token->text) + 80);
383
+ sprintf(buf, "token[\"%s\":%d:%d:%d]", rs2s(token->text),
384
+ token->start, token->end, token->pos_inc);
385
+ return rb_str_new2(buf);
386
+ }
387
+
388
+ /****************************************************************************
389
+ *
390
+ * TokenStream Methods
391
+ *
392
+ ****************************************************************************/
393
+
394
+ #define GET_TS(ts, self) Data_Get_Struct(self, TokenStream, ts)
395
+
396
+ static void
397
+ frt_ts_mark(void *p)
398
+ {
399
+ TokenStream *ts = (TokenStream *)p;
400
+ if (ts->text) frt_gc_mark(&ts->text);
401
+ }
402
+
403
+ static void
404
+ frt_ts_free(TokenStream *ts)
405
+ {
406
+ if (object_get(&ts->text) != Qnil) {
407
+ object_del(&ts->text);
408
+ }
409
+ object_del(ts);
410
+ ts_deref(ts);
411
+ }
412
+
413
+ static void frt_rets_free(TokenStream *ts);
414
+ static void frt_rets_mark(TokenStream *ts);
415
+ static Token *rets_next(TokenStream *ts);
416
+
417
+ static VALUE
418
+ get_rb_token_stream(TokenStream *ts)
419
+ {
420
+ VALUE rts = object_get(ts);
421
+ if (rts == Qnil) {
422
+ if (ts->next == &rets_next) {
423
+ rts = Data_Wrap_Struct(cTokenStream, &frt_rets_mark,
424
+ &frt_rets_free, ts);
425
+ } else {
426
+ rts = Data_Wrap_Struct(cTokenStream, &frt_ts_mark,
427
+ &frt_ts_free, ts);
428
+ }
429
+ object_add(ts, rts);
430
+ }
431
+ return rts;
432
+ }
433
+
434
+ static INLINE VALUE
435
+ get_wrapped_ts(VALUE self, VALUE rstr, TokenStream *ts)
436
+ {
437
+ StringValue(rstr);
438
+ ts->reset(ts, rs2s(rstr));
439
+ Frt_Wrap_Struct(self, &frt_ts_mark, &frt_ts_free, ts);
440
+ object_add(&ts->text, rstr);
441
+ object_add(ts, self);
442
+ return self;
443
+ }
444
+
445
+ /*
446
+ * call-seq:
447
+ * token_stream.text = text -> text
448
+ *
449
+ * Set the text attribute of the TokenStream to the text you wish to be
450
+ * tokenized. For example, you might do this:
451
+ *
452
+ * token_stream.text = File.read(file_name)
453
+ */
454
+ static VALUE
455
+ frt_ts_set_text(VALUE self, VALUE rtext)
456
+ {
457
+ TokenStream *ts;
458
+ Data_Get_Struct(self, TokenStream, ts);
459
+ StringValue(rtext);
460
+ ts->reset(ts, rs2s(rtext));
461
+
462
+ /* prevent garbage collection */
463
+ rb_ivar_set(self, id_text, rtext);
464
+
465
+ return rtext;
466
+ }
467
+
468
+ /*
469
+ * call-seq:
470
+ * token_stream.text -> text
471
+ *
472
+ * Return the text that the TokenStream is tokenizing
473
+ */
474
+ static VALUE
475
+ frt_ts_get_text(VALUE self)
476
+ {
477
+ VALUE rtext = Qnil;
478
+ TokenStream *ts;
479
+ Data_Get_Struct(self, TokenStream, ts);
480
+ if ((rtext = object_get(&ts->text)) == Qnil) {
481
+ if (ts->text) {
482
+ rtext = rb_str_new2(ts->text);
483
+ object_set(&ts->text, rtext);
484
+ }
485
+ }
486
+ return rtext;
487
+ }
488
+
489
+ /*
490
+ * call-seq:
491
+ * token_stream.next -> token
492
+ *
493
+ * Return the next token from the TokenStream or nil if there are no more
494
+ * tokens.
495
+ */
496
+ static VALUE
497
+ frt_ts_next(VALUE self)
498
+ {
499
+ TokenStream *ts;
500
+ Token *next;
501
+ GET_TS(ts, self);
502
+ next = ts->next(ts);
503
+ if (next == NULL) {
504
+ return Qnil;
505
+ }
506
+
507
+ return get_token(next);
508
+ }
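+ /* Illustrative example (not part of the original source): a typical
+ * tokenize-and-iterate loop over a TokenStream using the methods above.
+ *
+ *   require 'ferret'
+ *   include Ferret::Analysis
+ *
+ *   stream = StandardAnalyzer.new.token_stream(:body, "Fast cars are fast.")
+ *   while token = stream.next
+ *     puts token    # e.g. token["fast":0:4:1]
+ *   end
+ */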
509
+
510
+ /****************************************************************************
511
+ * TokenFilter
512
+ ****************************************************************************/
513
+
514
+ #define TkFilt(filter) ((TokenFilter *)(filter))
515
+
516
+ static void
517
+ frt_tf_mark(void *p)
518
+ {
519
+ TokenStream *ts = (TokenStream *)p;
520
+ if (TkFilt(ts)->sub_ts) {
521
+ frt_gc_mark(&TkFilt(ts)->sub_ts);
522
+ }
523
+ }
524
+
525
+ static void
526
+ frt_tf_free(TokenStream *ts)
527
+ {
528
+ if (TkFilt(ts)->sub_ts && (object_get(&TkFilt(ts)->sub_ts) != Qnil)) {
529
+ object_del(&TkFilt(ts)->sub_ts);
530
+ }
531
+ object_del(ts);
532
+ ts_deref(ts);
533
+ }
534
+
535
+
536
+ /****************************************************************************
537
+ * CWrappedTokenStream
538
+ ****************************************************************************/
539
+
540
+ #define CachedTS(token_stream) ((CachedTokenStream *)(token_stream))
541
+ #define CWTS(token_stream) ((CWrappedTokenStream *)(token_stream))
542
+
543
+ typedef struct CWrappedTokenStream {
544
+ CachedTokenStream super;
545
+ VALUE rts;
546
+ } CWrappedTokenStream;
547
+
548
+ static void
549
+ cwrts_destroy_i(TokenStream *ts)
550
+ {
551
+ if (object_get(&ts->text) != Qnil) {
552
+ object_del(&ts->text);
553
+ }
554
+ rb_hash_delete(object_space, ((VALUE)ts)|1);
555
+ /*printf("rb_hash_size = %d\n", frt_rb_hash_size(object_space)); */
556
+ free(ts);
557
+ }
558
+
559
+ static Token *
560
+ cwrts_next(TokenStream *ts)
561
+ {
562
+ VALUE rtoken = rb_funcall(CWTS(ts)->rts, id_next, 0);
563
+ return frt_set_token(&(CachedTS(ts)->token), rtoken);
564
+ }
565
+
566
+ static TokenStream *
567
+ cwrts_reset(TokenStream *ts, char *text)
568
+ {
569
+ ts->t = ts->text = text;
570
+ rb_funcall(CWTS(ts)->rts, id_reset, 1, rb_str_new2(text));
571
+ return ts;
572
+ }
573
+
574
+ static TokenStream *
575
+ cwrts_clone_i(TokenStream *orig_ts)
576
+ {
577
+ TokenStream *new_ts = ts_clone_size(orig_ts, sizeof(CWrappedTokenStream));
578
+ VALUE rts = CWTS(new_ts)->rts = rb_funcall(CWTS(orig_ts)->rts, id_clone, 0);
579
+ rb_hash_aset(object_space, ((VALUE)new_ts)|1, rts);
580
+ return new_ts;
581
+ }
582
+
583
+ static TokenStream *
584
+ frt_get_cwrapped_rts(VALUE rts)
585
+ {
586
+ TokenStream *ts;
587
+ if (frt_is_cclass(rts) && DATA_PTR(rts)) {
588
+ GET_TS(ts, rts);
589
+ REF(ts);
590
+ }
591
+ else {
592
+ ts = ts_new(CWrappedTokenStream);
593
+ CWTS(ts)->rts = rts;
594
+ ts->next = &cwrts_next;
595
+ ts->reset = &cwrts_reset;
596
+ ts->clone_i = &cwrts_clone_i;
597
+ ts->destroy_i = &cwrts_destroy_i;
598
+ /* prevent from being garbage collected */
599
+ rb_hash_aset(object_space, ((VALUE)ts)|1, rts);
600
+ ts->ref_cnt = 1;
601
+ }
602
+ return ts;
603
+ }
604
+
605
+ /****************************************************************************
606
+ * RegExpTokenStream
607
+ ****************************************************************************/
608
+
609
+ #define P "[_\\/.,-]"
610
+ #define HASDIGIT "\\w*\\d\\w*"
611
+ #define ALPHA "[-_[:alpha:]]"
612
+ #define ALNUM "[-_[:alnum:]]"
613
+
614
+ #define RETS(token_stream) ((RegExpTokenStream *)(token_stream))
615
+
616
+ static const char *TOKEN_RE =
617
+ ALPHA "+(('" ALPHA "+)+|\\.(" ALPHA "\\.)+|"
618
+ "(@|\\&)\\w+([-.]\\w+)*|:\\/\\/" ALNUM "+([-.\\/]" ALNUM "+)*)?"
619
+ "|\\w+(([-._]\\w+)*\\@\\w+([-.]\\w+)+"
620
+ "|" P HASDIGIT "(" P "\\w+" P HASDIGIT ")*(" P "\\w+)?"
621
+ "|(\\.\\w+)+"
622
+ "|"
623
+ ")";
624
+ static VALUE rtoken_re;
625
+
626
+ typedef struct RegExpTokenStream {
627
+ CachedTokenStream super;
628
+ VALUE rtext;
629
+ VALUE regex;
630
+ VALUE proc;
631
+ long curr_ind;
632
+ } RegExpTokenStream;
633
+
634
+ static void
635
+ rets_destroy_i(TokenStream *ts)
636
+ {
637
+ if (object_get(&ts->text) != Qnil) {
638
+ object_del(&ts->text);
639
+ }
640
+ rb_hash_delete(object_space, ((VALUE)ts)|1);
641
+ /*printf("rb_hash_size = %d\n", frt_rb_hash_size(object_space)); */
642
+ free(ts);
643
+ }
644
+
645
+ static void
646
+ frt_rets_free(TokenStream *ts)
647
+ {
648
+ if (object_get(&ts->text) != Qnil) {
649
+ object_del(&ts->text);
650
+ }
651
+ object_del(ts);
652
+ ts_deref(ts);
653
+ }
654
+
655
+ static void
656
+ frt_rets_mark(TokenStream *ts)
657
+ {
658
+ if (ts->text) frt_gc_mark(&ts->text);
659
+ rb_gc_mark(RETS(ts)->rtext);
660
+ rb_gc_mark(RETS(ts)->regex);
661
+ rb_gc_mark(RETS(ts)->proc);
662
+ }
663
+
664
+ /*
665
+ * call-seq:
666
+ * tokenizer.text = text -> text
667
+ *
668
+ * Set the text to be tokenized by the tokenizer. The tokenizer gets reset to
669
+ * tokenize the text from the beginning.
670
+ */
671
+ static VALUE
672
+ frt_rets_set_text(VALUE self, VALUE rtext)
673
+ {
674
+ TokenStream *ts;
675
+ GET_TS(ts, self);
676
+
677
+ rb_hash_aset(object_space, ((VALUE)ts)|1, rtext);
678
+ StringValue(rtext);
679
+ RETS(ts)->rtext = rtext;
680
+ RETS(ts)->curr_ind = 0;
681
+
682
+ return rtext;
683
+ }
684
+
685
+ /*
686
+ * call-seq:
687
+ * tokenizer.text -> text
688
+ *
689
+ * Get the text being tokenized by the tokenizer.
690
+ */
691
+ static VALUE
692
+ frt_rets_get_text(VALUE self)
693
+ {
694
+ TokenStream *ts;
695
+ GET_TS(ts, self);
696
+ return RETS(ts)->rtext;
697
+ }
698
+
699
+ #ifdef FRT_RUBY_VERSION_1_9
700
+
701
+ // partly lifted from ruby 1.9 string.c
702
+ #include <ruby/encoding.h>
703
+ #define BEG(no) regs->beg[no]
704
+ #define END(no) regs->end[no]
705
+ #define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
706
+ static VALUE
707
+ scan_once(VALUE str, VALUE pat, long *start)
708
+ {
709
+ VALUE match;
710
+ struct re_registers *regs;
711
+
712
+ if (rb_reg_search(pat, str, *start, 0) >= 0) {
713
+ match = rb_backref_get();
714
+ regs = RMATCH_REGS(match);
715
+ if (BEG(0) == END(0)) {
716
+ rb_encoding *enc = STR_ENC_GET(str);
717
+ /*
718
+ * Always consume at least one character of the input string
719
+ */
720
+ if (RSTRING_LEN(str) > END(0))
721
+ *start = END(0)+rb_enc_mbclen(RSTRING_PTR(str)+END(0),
722
+ RSTRING_END(str), enc);
723
+ else
724
+ *start = END(0)+1;
725
+ }
726
+ else {
727
+ *start = END(0);
728
+ }
729
+ return rb_reg_nth_match(0, match);
730
+ }
731
+ return Qnil;
732
+ }
733
+ //
734
+
735
+ static Token *
736
+ rets_next(TokenStream *ts)
737
+ {
738
+ VALUE ret;
739
+ long rtok_len;
740
+ int beg, end;
741
+ Check_Type(RETS(ts)->regex, T_REGEXP);
742
+ ret = scan_once(RETS(ts)->rtext, RETS(ts)->regex, &(RETS(ts)->curr_ind));
743
+ if (NIL_P(ret)) return NULL;
744
+
745
+ Check_Type(ret, T_STRING);
746
+ rtok_len = RSTRING_LEN(ret);
747
+ beg = RETS(ts)->curr_ind - rtok_len;
748
+ end = RETS(ts)->curr_ind;
749
+
750
+ if (NIL_P(RETS(ts)->proc)) {
751
+ return tk_set(&(CachedTS(ts)->token), rs2s(ret), rtok_len,
752
+ beg, end, 1);
753
+ } else {
754
+ VALUE rtok;
755
+ rtok = rb_funcall(RETS(ts)->proc, id_call, 1, ret);
756
+ return tk_set(&(CachedTS(ts)->token), rs2s(rtok),
757
+ RSTRING_LEN(rtok), beg, end, 1);
758
+ }
759
+ }
760
+
761
+ #else
762
+
763
+ static Token *
764
+ rets_next(TokenStream *ts)
765
+ {
766
+ static struct re_registers regs;
767
+ int ret, beg, end;
768
+ struct RString *rtext = RSTRING(RETS(ts)->rtext);
769
+ long rtext_len = RSTRING_LEN(RETS(ts)->rtext);
770
+ char *rtext_ptr = RSTRING_PTR(RETS(ts)->rtext);
771
+ Check_Type(RETS(ts)->regex, T_REGEXP);
772
+ ret = ruby_re_search(RREGEXP(RETS(ts)->regex)->ptr,
773
+ rtext_ptr, rtext_len,
774
+ RETS(ts)->curr_ind, rtext_len - RETS(ts)->curr_ind,
775
+ &regs);
776
+
777
+ if (ret == -2) rb_raise(rb_eStandardError, "regexp buffer overflow");
778
+ if (ret < 0) return NULL; /* not matched */
779
+
780
+ beg = regs.beg[0];
781
+ RETS(ts)->curr_ind = end = regs.end[0];
782
+ if (NIL_P(RETS(ts)->proc)) {
783
+ return tk_set(&(CachedTS(ts)->token), rtext_ptr + beg, end - beg,
784
+ beg, end, 1);
785
+ } else {
786
+ VALUE rtok = rb_str_new(rtext_ptr + beg, end - beg);
787
+ rtok = rb_funcall(RETS(ts)->proc, id_call, 1, rtok);
788
+ return tk_set(&(CachedTS(ts)->token), rs2s(rtok),
789
+ RSTRING_LEN(rtok), beg, end, 1);
790
+ }
791
+ }
792
+
793
+ #endif
794
+
795
+
796
+ static TokenStream *
797
+ rets_reset(TokenStream *ts, char *text)
798
+ {
799
+ RETS(ts)->rtext = rb_str_new2(text);
800
+ RETS(ts)->curr_ind = 0;
801
+ return ts;
802
+ }
803
+
804
+ static TokenStream *
805
+ rets_clone_i(TokenStream *orig_ts)
806
+ {
807
+ TokenStream *ts = ts_clone_size(orig_ts, sizeof(RegExpTokenStream));
808
+ return ts;
809
+ }
810
+
811
+ static TokenStream *
812
+ rets_new(VALUE rtext, VALUE regex, VALUE proc)
813
+ {
814
+ TokenStream *ts = ts_new(RegExpTokenStream);
815
+
816
+ if (rtext != Qnil) {
817
+ rtext = StringValue(rtext);
818
+ rb_hash_aset(object_space, ((VALUE)ts)|1, rtext);
819
+ }
820
+ ts->reset = &rets_reset;
821
+ ts->next = &rets_next;
822
+ ts->clone_i = &rets_clone_i;
823
+ ts->destroy_i = &rets_destroy_i;
824
+
825
+ RETS(ts)->curr_ind = 0;
826
+ RETS(ts)->rtext = rtext;
827
+ RETS(ts)->proc = proc;
828
+
829
+ if (NIL_P(regex)) {
830
+ RETS(ts)->regex = rtoken_re;
831
+ } else {
832
+ Check_Type(regex, T_REGEXP);
833
+ RETS(ts)->regex = regex;
834
+ }
835
+
836
+ return ts;
837
+ }
838
+
839
+ /*
840
+ * call-seq:
841
+ * RegExpTokenizer.new(input, /[[:alpha:]]+/)
842
+ *
843
+ * Create a new tokenizer based on a regular expression
844
+ *
845
+ * input:: text to tokenize
846
+ * regexp:: regular expression used to recognize tokens in the input
847
+ */
848
+ static VALUE
849
+ frt_rets_init(int argc, VALUE *argv, VALUE self)
850
+ {
851
+ VALUE rtext, regex, proc;
852
+ TokenStream *ts;
853
+
854
+ rb_scan_args(argc, argv, "11&", &rtext, &regex, &proc);
855
+
856
+ ts = rets_new(rtext, regex, proc);
857
+
858
+ Frt_Wrap_Struct(self, &frt_rets_mark, &frt_rets_free, ts);
859
+ object_add(ts, self);
860
+ return self;
861
+ }
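+ /* Illustrative example (not part of the original source): a
+ * RegExpTokenizer that treats runs of letters as tokens and downcases
+ * them via the optional block.
+ *
+ *   require 'ferret'
+ *   include Ferret::Analysis
+ *
+ *   stream = RegExpTokenizer.new("Ruby C Extension", /[[:alpha:]]+/) { |t| t.downcase }
+ *   while token = stream.next
+ *     puts token    # token["ruby":0:4:1], token["c":5:6:1], ...
+ *   end
+ */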
862
+
863
+ /****************************************************************************
864
+ * Tokenizers
865
+ ****************************************************************************/
866
+
867
+ #define TS_ARGS(dflt) \
868
+ bool lower;\
869
+ VALUE rlower, rstr;\
870
+ rb_scan_args(argc, argv, "11", &rstr, &rlower);\
871
+ lower = (argc ? RTEST(rlower) : dflt)
872
+
873
+ /*
874
+ * call-seq:
875
+ * AsciiLetterTokenizer.new() -> tokenizer
876
+ *
877
+ * Create a new AsciiLetterTokenizer
878
+ */
879
+ static VALUE
880
+ frt_a_letter_tokenizer_init(VALUE self, VALUE rstr)
881
+ {
882
+ return get_wrapped_ts(self, rstr, letter_tokenizer_new());
883
+ }
884
+
885
+ /*
886
+ * call-seq:
887
+ * LetterTokenizer.new(lower = true) -> tokenizer
888
+ *
889
+ * Create a new LetterTokenizer which optionally downcases tokens. Downcasing
890
+ * is done according to the current locale.
891
+ *
892
+ * lower:: set to false if you don't wish to downcase tokens
893
+ */
894
+ static VALUE
895
+ frt_letter_tokenizer_init(int argc, VALUE *argv, VALUE self)
896
+ {
897
+ TS_ARGS(false);
898
+ #ifndef POSH_OS_WIN32
899
+ if (!frt_locale) frt_locale = setlocale(LC_CTYPE, "");
900
+ #endif
901
+ return get_wrapped_ts(self, rstr, mb_letter_tokenizer_new(lower));
902
+ }
903
+
904
+ /*
905
+ * call-seq:
906
+ * AsciiWhiteSpaceTokenizer.new() -> tokenizer
907
+ *
908
+ * Create a new AsciiWhiteSpaceTokenizer
909
+ */
910
+ static VALUE
911
+ frt_a_whitespace_tokenizer_init(VALUE self, VALUE rstr)
912
+ {
913
+ return get_wrapped_ts(self, rstr, whitespace_tokenizer_new());
914
+ }
915
+
916
+ /*
917
+ * call-seq:
918
+ * WhiteSpaceTokenizer.new(lower = true) -> tokenizer
919
+ *
920
+ * Create a new WhiteSpaceTokenizer which optionally downcases tokens.
921
+ * Downcasing is done according to the current locale.
922
+ *
923
+ * lower:: set to false if you don't wish to downcase tokens
924
+ */
925
+ static VALUE
926
+ frt_whitespace_tokenizer_init(int argc, VALUE *argv, VALUE self)
927
+ {
928
+ TS_ARGS(false);
929
+ #ifndef POSH_OS_WIN32
930
+ if (!frt_locale) frt_locale = setlocale(LC_CTYPE, "");
931
+ #endif
932
+ return get_wrapped_ts(self, rstr, mb_whitespace_tokenizer_new(lower));
933
+ }
934
+
935
+ /*
936
+ * call-seq:
937
+ * AsciiStandardTokenizer.new() -> tokenizer
938
+ *
939
+ * Create a new AsciiStandardTokenizer
940
+ */
941
+ static VALUE
942
+ frt_a_standard_tokenizer_init(VALUE self, VALUE rstr)
943
+ {
944
+ return get_wrapped_ts(self, rstr, standard_tokenizer_new());
945
+ }
946
+
947
+ /*
948
+ * call-seq:
949
+ * StandardTokenizer.new(lower = true) -> tokenizer
950
+ *
951
+ * Create a new StandardTokenizer which optionally downcases tokens.
952
+ * Downcasing is done according to the current locale.
953
+ *
954
+ * lower:: set to false if you don't wish to downcase tokens
955
+ */
956
+ static VALUE
957
+ frt_standard_tokenizer_init(VALUE self, VALUE rstr)
958
+ {
959
+ #ifndef POSH_OS_WIN32
960
+ if (!frt_locale) frt_locale = setlocale(LC_CTYPE, "");
961
+ #endif
962
+ return get_wrapped_ts(self, rstr, mb_standard_tokenizer_new());
963
+ }
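+ /* Illustrative example (not part of the original source): exercising the
+ * standard tokenizer wrapped above on input that plainer tokenizers would
+ * split apart.
+ *
+ *   require 'ferret'
+ *   include Ferret::Analysis
+ *
+ *   stream = StandardTokenizer.new("e-mail me at dave@example.com")
+ *   while token = stream.next
+ *     puts token    # the e-mail address survives as a single token
+ *   end
+ */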
964
+
965
+ /****************************************************************************
966
+ * Filters
967
+ ****************************************************************************/
968
+
969
+
970
+ /*
971
+ * call-seq:
972
+ * AsciiLowerCaseFilter.new(token_stream) -> token_stream
973
+ *
974
+ * Create an AsciiLowerCaseFilter which normalizes a token's text to
975
+ * lowercase but only for ASCII characters. For other characters use
976
+ * LowerCaseFilter.
977
+ */
978
+ static VALUE
979
+ frt_a_lowercase_filter_init(VALUE self, VALUE rsub_ts)
980
+ {
981
+ TokenStream *ts = frt_get_cwrapped_rts(rsub_ts);
982
+ ts = lowercase_filter_new(ts);
983
+ object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
984
+
985
+ Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
986
+ object_add(ts, self);
987
+ return self;
988
+ }
989
+
990
+ /*
991
+ * call-seq:
992
+ * LowerCaseFilter.new(token_stream) -> token_stream
993
+ *
994
+ * Create a LowerCaseFilter which normalizes a token's text to
995
+ * lowercase based on the current locale.
996
+ */
997
+ static VALUE
998
+ frt_lowercase_filter_init(VALUE self, VALUE rsub_ts)
999
+ {
1000
+ TokenStream *ts = frt_get_cwrapped_rts(rsub_ts);
1001
+ #ifndef POSH_OS_WIN32
1002
+ if (!frt_locale) frt_locale = setlocale(LC_CTYPE, "");
1003
+ #endif
1004
+ ts = mb_lowercase_filter_new(ts);
1005
+ object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
1006
+
1007
+ Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
1008
+ object_add(ts, self);
1009
+ return self;
1010
+ }
1011
+
1012
+ /*
1013
+ * call-seq:
1014
+ * HyphenFilter.new(token_stream) -> token_stream
1015
+ *
1016
+ * Create a HyphenFilter which handles hyphenated words. It works by adding
1017
+ * both the concatenated word and its individual parts to the stream, i.e.
1018
+ * "e-mail" becomes "email" and "e mail". This way searches for "e-mail",
1019
+ * "email" and "mail" will all match. This filter is
1020
+ * used by default by the StandardAnalyzer.
1021
+ */
1022
+ static VALUE
1023
+ frt_hyphen_filter_init(VALUE self, VALUE rsub_ts)
1024
+ {
1025
+ TokenStream *ts = frt_get_cwrapped_rts(rsub_ts);
1026
+ ts = hyphen_filter_new(ts);
1027
+ object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
1028
+
1029
+ Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
1030
+ object_add(ts, self);
1031
+ return self;
1032
+ }
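+ /* Illustrative example (not part of the original source): wiring a
+ * HyphenFilter behind a tokenizer, much as the StandardAnalyzer does
+ * internally.
+ *
+ *   require 'ferret'
+ *   include Ferret::Analysis
+ *
+ *   stream = HyphenFilter.new(StandardTokenizer.new("e-mail addresses"))
+ *   while token = stream.next
+ *     puts token    # hyphenated words are expanded as described above
+ *   end
+ */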
1033
+
1034
+ /*
1035
+ * call-seq:
1036
+ * StopFilter.new(token_stream) -> token_stream
1037
+ * StopFilter.new(token_stream, ["the", "and", "it"]) -> token_stream
1038
+ *
1039
+ * Create a StopFilter which removes *stop-words* from a TokenStream. You can
1040
+ * optionally specify the stopwords you wish to have removed.
1041
+ *
1042
+ * token_stream:: TokenStream to be filtered
1043
+ * stop_words:: Array of *stop-words* you wish to be filtered out. This
1044
+ * defaults to a list of English stop-words. The
1045
+ * Ferret::Analysis contains a number of stop-word lists.
1046
+ */
1047
+ static VALUE
1048
+ frt_stop_filter_init(int argc, VALUE *argv, VALUE self)
1049
+ {
1050
+ VALUE rsub_ts, rstop_words;
1051
+ TokenStream *ts;
1052
+ rb_scan_args(argc, argv, "11", &rsub_ts, &rstop_words);
1053
+ ts = frt_get_cwrapped_rts(rsub_ts);
1054
+ if (rstop_words != Qnil) {
1055
+ char **stop_words = get_stopwords(rstop_words);
1056
+ ts = stop_filter_new_with_words(ts, (const char **)stop_words);
1057
+
1058
+ free(stop_words);
1059
+ } else {
1060
+ ts = stop_filter_new(ts);
1061
+ }
1062
+ object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
1063
+
1064
+ Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
1065
+ object_add(ts, self);
1066
+ return self;
1067
+ }
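+ /* Illustrative example (not part of the original source): a custom
+ * stop-word list applied behind a whitespace tokenizer.
+ *
+ *   require 'ferret'
+ *   include Ferret::Analysis
+ *
+ *   stream = StopFilter.new(AsciiWhiteSpaceTokenizer.new("the old man and the sea"),
+ *                           ["the", "and"])
+ *   while token = stream.next
+ *     puts token    # "old", "man" and "sea", with adjusted position increments
+ *   end
+ */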
1068
+
1069
+ static INLINE void frt_add_mapping_i(TokenStream *mf, VALUE from, char *to)
1070
+ {
1071
+ switch (TYPE(from)) {
1072
+ case T_STRING:
1073
+ mapping_filter_add(mf, rs2s(from), to);
1074
+ break;
1075
+ case T_SYMBOL:
1076
+ mapping_filter_add(mf, rb_id2name(SYM2ID(from)), to);
1077
+ break;
1078
+ default:
1079
+ rb_raise(rb_eArgError,
1080
+ "cannot map from %s with MappingFilter",
1081
+ rs2s(rb_obj_as_string(from)));
1082
+ break;
1083
+ }
1084
+ }
1085
+
1086
+ static int frt_add_mappings_i(VALUE key, VALUE value, VALUE arg)
1087
+ {
1088
+ if (key == Qundef) {
1089
+ return ST_CONTINUE;
1090
+ } else {
1091
+ TokenStream *mf = (TokenStream *)arg;
1092
+ char *to;
1093
+ switch (TYPE(value)) {
1094
+ case T_STRING:
1095
+ to = rs2s(value);
1096
+ break;
1097
+ case T_SYMBOL:
1098
+ to = rb_id2name(SYM2ID(value));
1099
+ break;
1100
+ default:
1101
+ rb_raise(rb_eArgError,
1102
+ "cannot map to %s with MappingFilter",
1103
+ rs2s(rb_obj_as_string(key)));
1104
+ break;
1105
+ }
1106
+ if (TYPE(key) == T_ARRAY) {
1107
+ int i;
1108
+ for (i = RARRAY_LEN(key) - 1; i >= 0; i--) {
1109
+ frt_add_mapping_i(mf, RARRAY_PTR(key)[i], to);
1110
+ }
1111
+ }
1112
+ else {
1113
+ frt_add_mapping_i(mf, key, to);
1114
+ }
1115
+ }
1116
+ return ST_CONTINUE;
1117
+ }
1118
+
1119
+
1120
+ /*
1121
+ * call-seq:
1122
+ * MappingFilter.new(token_stream, mapping) -> token_stream
1123
+ *
1124
+ * Create a MappingFilter which maps strings in tokens. This is usually used
1125
+ * to map UTF-8 characters to ASCII characters for easier searching and
1126
+ * better search recall. The mapping is compiled into a Deterministic Finite
1127
+ * Automaton so it is very fast. This filter can therefore be used for
1128
+ * indexing very large datasets. Currently regular expressions are not
1129
+ * supported. If you are really interested in the feature, please contact me
1130
+ * at dbalmain@gmail.com.
1131
+ *
1132
+ * token_stream:: TokenStream to be filtered
1133
+ * mapping:: Hash of mappings to apply to tokens. The key can be a
1134
+ * String or an Array of Strings. The value must be a String
1135
+ *
1136
+ * == Example
1137
+ *
1138
+ * filt = MappingFilter.new(token_stream,
1139
+ * {
1140
+ * ['à','á','â','ã','ä','å'] => 'a',
1141
+ * ['è','é','ê','ë','ē','ę'] => 'e'
1142
+ * })
1143
+ */
1144
+ static VALUE
1145
+ frt_mapping_filter_init(VALUE self, VALUE rsub_ts, VALUE mapping)
1146
+ {
1147
+ TokenStream *ts;
1148
+ ts = frt_get_cwrapped_rts(rsub_ts);
1149
+ ts = mapping_filter_new(ts);
1150
+ rb_hash_foreach(mapping, frt_add_mappings_i, (VALUE)ts);
1151
+ mulmap_compile(((MappingFilter *)ts)->mapper);
1152
+ object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
1153
+
1154
+ Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
1155
+ object_add(ts, self);
1156
+ return self;
1157
+ }
1158
+
1159
+ /*
1160
+ * call-seq:
1161
+ * StemFilter.new(token_stream) -> token_stream
1162
+ * StemFilter.new(token_stream,
1163
+ * algorithm="english",
1164
+ * encoding="UTF-8") -> token_stream
1165
+ *
1166
+ * Create a StemFilter which uses a Snowball stemmer (thank you Martin
1167
+ * Porter) to stem words. You can optionally specify the algorithm (default:
1168
+ * "english") and encoding (default: "UTF-8").
1169
+ *
1170
+ * token_stream:: TokenStream to be filtered
1171
+ * algorithm:: The algorithm (or language) to use
1172
+ * encoding:: The encoding of the data (default: "UTF-8")
1173
+ */
1174
+ static VALUE
1175
+ frt_stem_filter_init(int argc, VALUE *argv, VALUE self)
1176
+ {
1177
+ VALUE rsub_ts, ralgorithm, rcharenc;
1178
+ char *algorithm = "english";
1179
+ char *charenc = NULL;
1180
+ TokenStream *ts;
1181
+ rb_scan_args(argc, argv, "12", &rsub_ts, &ralgorithm, &rcharenc);
1182
+ ts = frt_get_cwrapped_rts(rsub_ts);
1183
+ switch (argc) {
1184
+ case 3: charenc = rs2s(rb_obj_as_string(rcharenc));
1185
+ case 2: algorithm = rs2s(rb_obj_as_string(ralgorithm));
1186
+ }
1187
+ ts = stem_filter_new(ts, algorithm, charenc);
1188
+ object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
1189
+
1190
+ Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
1191
+ object_add(ts, self);
1192
+ return self;
1193
+ }
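+ /* Illustrative example (not part of the original source): stemming the
+ * output of a lower-casing filter chain with the default English
+ * algorithm.
+ *
+ *   require 'ferret'
+ *   include Ferret::Analysis
+ *
+ *   stream = StemFilter.new(LowerCaseFilter.new(LetterTokenizer.new("Debating debates")))
+ *   while token = stream.next
+ *     puts token    # both words reduce to a stem such as "debat"
+ *   end
+ */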
1194
+
1195
+ /****************************************************************************
1196
+ *
1197
+ * Analyzer Methods
1198
+ *
1199
+ ****************************************************************************/
1200
+
1201
+ /****************************************************************************
1202
+ * CWrappedAnalyzer Methods
1203
+ ****************************************************************************/
1204
+
1205
+ #define GET_A(a, self) Data_Get_Struct(self, Analyzer, a)
1206
+
1207
+ #define CWA(analyzer) ((CWrappedAnalyzer *)(analyzer))
1208
+ typedef struct CWrappedAnalyzer
1209
+ {
1210
+ Analyzer super;
1211
+ VALUE ranalyzer;
1212
+ } CWrappedAnalyzer;
1213
+
1214
+ static void
1215
+ cwa_destroy_i(Analyzer *a)
1216
+ {
1217
+ rb_hash_delete(object_space, ((VALUE)a)|1);
1218
+ /*printf("rb_hash_size = %d\n", frt_rb_hash_size(object_space)); */
1219
+ free(a);
1220
+ }
1221
+
1222
+ static TokenStream *
1223
+ cwa_get_ts(Analyzer *a, char *field, char *text)
1224
+ {
1225
+ VALUE rts = rb_funcall(CWA(a)->ranalyzer, id_token_stream, 2,
1226
+ ID2SYM(rb_intern(field)), rb_str_new2(text));
1227
+ return frt_get_cwrapped_rts(rts);
1228
+ }
1229
+
1230
+ Analyzer *
1231
+ frt_get_cwrapped_analyzer(VALUE ranalyzer)
1232
+ {
1233
+ Analyzer *a = NULL;
1234
+ if (frt_is_cclass(ranalyzer) && DATA_PTR(ranalyzer)) {
1235
+ Data_Get_Struct(ranalyzer, Analyzer, a);
1236
+ REF(a);
1237
+ }
1238
+ else {
1239
+ a = (Analyzer *)ecalloc(sizeof(CWrappedAnalyzer));
1240
+ a->destroy_i = &cwa_destroy_i;
1241
+ a->get_ts = &cwa_get_ts;
1242
+ a->ref_cnt = 1;
1243
+ ((CWrappedAnalyzer *)a)->ranalyzer = ranalyzer;
1244
+ /* prevent from being garbage collected */
1245
+ rb_hash_aset(object_space, ((VALUE)a)|1, ranalyzer);
1246
+ }
1247
+ return a;
1248
+ }
1249
+
1250
+ static void
1251
+ frt_analyzer_free(Analyzer *a)
1252
+ {
1253
+ object_del(a);
1254
+ a_deref(a);
1255
+ }
1256
+
1257
+ VALUE
1258
+ frt_get_analyzer(Analyzer *a)
1259
+ {
1260
+ VALUE self = Qnil;
1261
+ if (a) {
1262
+ self = object_get(a);
1263
+ if (self == Qnil) {
1264
+ self = Data_Wrap_Struct(cAnalyzer, NULL, &frt_analyzer_free, a);
1265
+ REF(a);
1266
+ object_add(a, self);
1267
+ }
1268
+ }
1269
+ return self;
1270
+ }
1271
+
1272
+ INLINE VALUE
1273
+ get_rb_ts_from_a(Analyzer *a, VALUE rfield, VALUE rstring)
1274
+ {
1275
+ TokenStream *ts = a_get_ts(a, frt_field(rfield), rs2s(rstring));
1276
+
1277
+ /* Make sure that there is no entry already */
1278
+ object_set(&ts->text, rstring);
1279
+ return get_rb_token_stream(ts);
1280
+ }
1281
+
1282
+ /*
1283
+ * call-seq:
1284
+ * analyzer.token_stream(field_name, input) -> token_stream
1285
+ *
1286
+ * Create a new TokenStream to tokenize +input+. The TokenStream created may
1287
+ * also depend on the +field_name+, although this parameter is typically
1288
+ * ignored.
1289
+ *
1290
+ * field_name:: name of the field to be tokenized
1291
+ * input:: data from the field to be tokenized
1292
+ */
1293
+ static VALUE
1294
+ frt_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
1295
+ {
1296
+ /* NOTE: Any changes made to this method may also need to be applied to
1297
+ * frt_re_analyzer_token_stream */
1298
+ Analyzer *a;
1299
+ GET_A(a, self);
1300
+
1301
+ StringValue(rstring);
1302
+
1303
+ return get_rb_ts_from_a(a, rfield, rstring);
1304
+ }
1305
+
1306
+ #define GET_LOWER(dflt) \
1307
+ bool lower;\
1308
+ VALUE rlower;\
1309
+ rb_scan_args(argc, argv, "01", &rlower);\
1310
+ lower = (argc ? RTEST(rlower) : dflt)
1311
+
1312
+ /*
1313
+ * call-seq:
1314
+ * AsciiWhiteSpaceAnalyzer.new(lower = false) -> analyzer
1315
+ *
1316
+ * Create a new AsciiWhiteSpaceAnalyzer which leaves case as is by default
1317
+ * but can optionally downcase tokens. Lowercasing will only be done to
1318
+ * ASCII characters.
1319
+ *
1320
+ * lower:: set to true if you want the field's tokens to be downcased
1321
+ */
1322
+ static VALUE
1323
+ frt_a_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
1324
+ {
1325
+ Analyzer *a;
1326
+ GET_LOWER(false);
1327
+ a = whitespace_analyzer_new(lower);
1328
+ Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
1329
+ object_add(a, self);
1330
+ return self;
1331
+ }
1332
+
1333
+ /*
1334
+ * call-seq:
1335
+ * WhiteSpaceAnalyzer.new(lower = false) -> analyzer
1336
+ *
1337
+ * Create a new WhiteSpaceAnalyzer which leaves case as is by default but can
1338
+ * optionally downcase tokens. Lowercasing will be done based on the current
1339
+ * locale.
1340
+ *
1341
+ * lower:: set to true if you want the field's tokens to be downcased
1342
+ */
1343
+ static VALUE
1344
+ frt_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
1345
+ {
1346
+ Analyzer *a;
1347
+ GET_LOWER(false);
1348
+ #ifndef POSH_OS_WIN32
1349
+ if (!frt_locale) frt_locale = setlocale(LC_CTYPE, "");
1350
+ #endif
1351
+ a = mb_whitespace_analyzer_new(lower);
1352
+ Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
1353
+ object_add(a, self);
1354
+ return self;
1355
+ }
1356
+
1357
+ /*
1358
+ * call-seq:
1359
+ * AsciiLetterAnalyzer.new(lower = true) -> analyzer
1360
+ *
1361
+ * Create a new AsciiLetterAnalyzer which downcases tokens by default
1362
+ * but can optionally leave case as is. Lowercasing will only be done to
1363
+ * ASCII characters.
1364
+ *
1365
+ * lower:: set to false if you don't want the field's tokens to be downcased
1366
+ */
1367
+ static VALUE
1368
+ frt_a_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
1369
+ {
1370
+ Analyzer *a;
1371
+ GET_LOWER(true);
1372
+ a = letter_analyzer_new(lower);
1373
+ Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
1374
+ object_add(a, self);
1375
+ return self;
1376
+ }
1377
+
1378
+ /*
1379
+ * call-seq:
1380
+ * LetterAnalyzer.new(lower = true) -> analyzer
1381
+ *
1382
+ * Create a new LetterAnalyzer which downcases tokens by default but can
1383
+ * optionally leave case as is. Lowercasing will be done based on the current
1384
+ * locale.
1385
+ *
1386
+ * lower:: set to false if you don't want the field's tokens to be downcased
1387
+ */
1388
+ static VALUE
1389
+ frt_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
1390
+ {
1391
+ Analyzer *a;
1392
+ GET_LOWER(true);
1393
+ #ifndef POSH_OS_WIN32
1394
+ if (!frt_locale) frt_locale = setlocale(LC_CTYPE, "");
1395
+ #endif
1396
+ a = mb_letter_analyzer_new(lower);
1397
+ Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
1398
+ object_add(a, self);
1399
+ return self;
1400
+ }
1401
+
1402
+ static VALUE
1403
+ get_rstopwords(const char **stop_words)
1404
+ {
1405
+ char **w = (char **)stop_words;
1406
+ VALUE rstopwords = rb_ary_new();
1407
+
1408
+ while (*w) {
1409
+ rb_ary_push(rstopwords, rb_str_new2(*w));
1410
+ w++;
1411
+ }
1412
+ return rstopwords;
1413
+ }
1414
+
1415
+ /*
1416
+ * call-seq:
1417
+ * AsciiStandardAnalyzer.new(stop_words = FULL_ENGLISH_STOP_WORDS, lower = true)
1418
+ * -> analyzer
1419
+ *
1420
+ * Create a new AsciiStandardAnalyzer which downcases tokens by default but
1421
+ * can optionally leave case as is. Lowercasing will only be done to ASCII
1422
+ * characters. You can also set the list of stop-words to be used by the
1423
+ * StopFilter.
1424
+ *
1425
+ * lower:: set to false if you don't want the field's tokens to be downcased
1426
+ * stop_words:: list of stop-words to pass to the StopFilter
1427
+ */
1428
+ static VALUE
1429
+ frt_a_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
1430
+ {
1431
+ bool lower;
1432
+ VALUE rlower, rstop_words;
1433
+ Analyzer *a;
1434
+ rb_scan_args(argc, argv, "02", &rstop_words, &rlower);
1435
+ lower = ((rlower == Qnil) ? true : RTEST(rlower));
1436
+ if (rstop_words != Qnil) {
1437
+ char **stop_words = get_stopwords(rstop_words);
1438
+ a = standard_analyzer_new_with_words((const char **)stop_words, lower);
1439
+ free(stop_words);
1440
+ } else {
1441
+ a = standard_analyzer_new(lower);
1442
+ }
1443
+ Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
1444
+ object_add(a, self);
1445
+ return self;
1446
+ }
1447
+
1448
+ /*
1449
+ * call-seq:
1450
+ * StandardAnalyzer.new(stop_words = FULL_ENGLISH_STOP_WORDS, lower=true)
1451
+ * -> analyzer
1452
+ *
1453
+ * Create a new StandardAnalyzer which downcases tokens by default but can
1454
+ * optionally leave case as is. Lowercasing will be done based on the current
1455
+ * locale. You can also set the list of stop-words to be used by the
1456
+ * StopFilter.
1457
+ *
1458
+ * lower:: set to false if you don't want the field's tokens to be downcased
1459
+ * stop_words:: list of stop-words to pass to the StopFilter
1460
+ */
1461
+ static VALUE
1462
+ frt_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
1463
+ {
1464
+ bool lower;
1465
+ VALUE rlower, rstop_words;
1466
+ Analyzer *a;
1467
+ #ifndef POSH_OS_WIN32
1468
+ if (!frt_locale) frt_locale = setlocale(LC_CTYPE, "");
1469
+ #endif
1470
+ rb_scan_args(argc, argv, "02", &rstop_words, &rlower);
1471
+ lower = ((rlower == Qnil) ? true : RTEST(rlower));
1472
+ if (rstop_words != Qnil) {
1473
+ char **stop_words = get_stopwords(rstop_words);
1474
+ a = mb_standard_analyzer_new_with_words((const char **)stop_words, lower);
1475
+ free(stop_words);
1476
+ } else {
1477
+ a = mb_standard_analyzer_new(lower);
1478
+ }
1479
+ Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
1480
+ object_add(a, self);
1481
+ return self;
1482
+ }
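+ /* Illustrative example (not part of the original source): a
+ * StandardAnalyzer with a custom stop-word list, driving the token_stream
+ * method documented further below.
+ *
+ *   require 'ferret'
+ *   include Ferret::Analysis
+ *
+ *   analyzer = StandardAnalyzer.new(["and", "the"])
+ *   stream = analyzer.token_stream(:title, "The Old Man and the Sea")
+ *   while token = stream.next
+ *     puts token    # "old", "man", "sea" with the stop words skipped
+ *   end
+ */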
1483
+
1484
+ static void
1485
+ frt_h_mark_values_i(void *key, void *value, void *arg)
1486
+ {
1487
+ frt_gc_mark(value);
1488
+ }
1489
+
1490
+ static void
1491
+ frt_pfa_mark(void *p)
1492
+ {
1493
+ frt_gc_mark(PFA(p)->default_a);
1494
+ h_each(PFA(p)->dict, &frt_h_mark_values_i, NULL);
1495
+ }
1496
+
1497
+ /*** PerFieldAnalyzer ***/
1498
+
1499
+ /*
1500
+ * call-seq:
1501
+ * PerFieldAnalyzer.new(default_analyzer) -> analyzer
1502
+ *
1503
+ * Create a new PerFieldAnalyzer specifying the default analyzer to use on
1504
+ * all fields that aren't specifically set.
1505
+ *
1506
+ * default_analyzer:: analyzer to be used on fields that aren't otherwise
1507
+ * specified
1508
+ */
1509
+ static VALUE
1510
+ frt_per_field_analyzer_init(VALUE self, VALUE ranalyzer)
1511
+ {
1512
+ Analyzer *def = frt_get_cwrapped_analyzer(ranalyzer);
1513
+ Analyzer *a = per_field_analyzer_new(def);
1514
+ Frt_Wrap_Struct(self, &frt_pfa_mark, &frt_analyzer_free, a);
1515
+ object_add(a, self);
1516
+ return self;
1517
+ }
1518
+
1519
+ /*
1520
+ * call-seq:
1521
+ * per_field_analyzer.add_field(field_name, analyzer) -> self
1522
+ * per_field_analyzer[field_name] = analyzer -> self
1523
+ *
1524
+ * Set the analyzer to be used on field +field_name+. Note that field_name
1525
+ * should be a symbol.
1526
+ *
1527
+ * field_name:: field we wish to set the analyzer for
1528
+ * analyzer:: analyzer to be used on +field_name+
1529
+ */
1530
+ static VALUE
1531
+ frt_per_field_analyzer_add_field(VALUE self, VALUE rfield, VALUE ranalyzer)
1532
+ {
1533
+ Analyzer *pfa, *a;
1534
+ Data_Get_Struct(self, Analyzer, pfa);
1535
+ a = frt_get_cwrapped_analyzer(ranalyzer);
1536
+
1537
+ pfa_add_field(pfa, frt_field(rfield), a);
1538
+ return self;
1539
+ }
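+ /* Illustrative example (not part of the original source): combining the
+ * methods above to analyze different fields differently.
+ *
+ *   require 'ferret'
+ *   include Ferret::Analysis
+ *
+ *   pfa = PerFieldAnalyzer.new(StandardAnalyzer.new)
+ *   pfa[:tags] = WhiteSpaceAnalyzer.new(false)
+ *   stream = pfa.token_stream(:tags, "Ruby C-extension")
+ *   while token = stream.next
+ *     puts token
+ *   end
+ */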
1540
+
1541
+ /*
1542
+ * call-seq:
1543
+ * analyzer.token_stream(field_name, input) -> token_stream
1544
+ *
1545
+ * Create a new TokenStream to tokenize +input+. The TokenStream created will
1546
+ * also depend on the +field_name+ in the case of the PerFieldAnalyzer.
1547
+ *
1548
+ * field_name:: name of the field to be tokenized
1549
+ * input:: data from the field to be tokenized
1550
+ */
1551
+ static VALUE
1552
+ frt_pfa_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
1553
+ {
1554
+ Analyzer *pfa, *a;
1555
+ char *field = frt_field(rfield);
1556
+ GET_A(pfa, self);
1557
+
1558
+ StringValue(rstring);
1559
+ a = (Analyzer *)h_get(PFA(pfa)->dict, field);
1560
+ if (a == NULL) {
1561
+ a = PFA(pfa)->default_a;
1562
+ }
1563
+ if (a->get_ts == cwa_get_ts) {
1564
+ return rb_funcall(CWA(a)->ranalyzer, id_token_stream, 2,
1565
+ ID2SYM(rb_intern(field)), rb_str_new2(rs2s(rstring)));
1566
+ }
1567
+ else {
1568
+ return get_rb_ts_from_a(a, rfield, rstring);
1569
+ }
1570
+ }
1571
+
1572
+ /*** RegExpAnalyzer ***/
1573
+
1574
+ static void
1575
+ frt_re_analyzer_mark(Analyzer *a)
1576
+ {
1577
+ frt_gc_mark(a->current_ts);
1578
+ }
1579
+
1580
+ static void
1581
+ re_analyzer_destroy_i(Analyzer *a)
1582
+ {
1583
+ ts_deref(a->current_ts);
1584
+ free(a);
1585
+ }
1586
+
1587
+ /*
1588
+ * call-seq:
1589
+ * RegExpAnalyzer.new(reg_exp, lower = true) -> analyzer
1590
+ *
1591
+ * Create a new RegExpAnalyzer which will create tokenizers based on the
1592
+ * regular expression and lowercasing if required.
1593
+ *
1594
+ * reg_exp:: the token matcher for the tokenizer to use
1595
+ * lower:: set to false if you don't want to downcase the tokens
1596
+ */
1597
+ static VALUE
1598
+ frt_re_analyzer_init(int argc, VALUE *argv, VALUE self)
1599
+ {
1600
+ VALUE lower, rets, regex, proc;
1601
+ Analyzer *a;
1602
+ TokenStream *ts;
1603
+ rb_scan_args(argc, argv, "02&", &regex, &lower, &proc);
1604
+
1605
+ ts = rets_new(Qnil, regex, proc);
1606
+ rets = Data_Wrap_Struct(cRegExpTokenizer, &frt_rets_mark, &frt_rets_free, ts);
1607
+ object_add(ts, rets);
1608
+
1609
+ if (lower != Qfalse) {
1610
+ rets = frt_lowercase_filter_init(frt_data_alloc(cLowerCaseFilter), rets);
1611
+ ts = DATA_PTR(rets);
1612
+ }
1613
+ REF(ts);
1614
+
1615
+ a = analyzer_new(ts, &re_analyzer_destroy_i, NULL);
1616
+ Frt_Wrap_Struct(self, &frt_re_analyzer_mark, &frt_analyzer_free, a);
1617
+ object_add(a, self);
1618
+ return self;
1619
+ }
1620
+
1621
+ /*
1622
+ * call-seq:
1623
+ * analyzer.token_stream(field_name, input) -> token_stream
1624
+ *
1625
+ * Create a new TokenStream to tokenize +input+. The TokenStream created may
1626
+ * also depend on the +field_name+, although this parameter is typically
1627
+ * ignored.
1628
+ *
1629
+ * field_name:: name of the field to be tokenized
1630
+ * input:: data from the field to be tokenized
1631
+ */
1632
+ static VALUE
1633
+ frt_re_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rtext)
1634
+ {
1635
+ TokenStream *ts;
1636
+ Analyzer *a;
1637
+ GET_A(a, self);
1638
+
1639
+ StringValue(rtext);
1640
+
1641
+ ts = a_get_ts(a, frt_field(rfield), rs2s(rtext));
1642
+
1643
+ /* Make sure that there is no entry already */
1644
+ object_set(&ts->text, rtext);
1645
+ if (ts->next == &rets_next) {
1646
+ RETS(ts)->rtext = rtext;
1647
+ rb_hash_aset(object_space, ((VALUE)ts)|1, rtext);
1648
+ }
1649
+ else {
1650
+ RETS(((TokenFilter*)ts)->sub_ts)->rtext = rtext;
1651
+ rb_hash_aset(object_space, ((VALUE)((TokenFilter*)ts)->sub_ts)|1, rtext);
1652
+ }
1653
+ return get_rb_token_stream(ts);
1654
+ }
1655
+
1656
+ /****************************************************************************
1657
+ *
1658
+ * Locale stuff
1659
+ *
1660
+ ****************************************************************************/
1661
+
1662
+ /*
1663
+ * call-seq:
1664
+ * Ferret.locale -> locale_str
1665
+ *
1666
+ * Returns a string corresponding to the locale set. For example;
1667
+ *
1668
+ * puts Ferret.locale #=> "en_US.UTF-8"
1669
+ */
1670
+ static VALUE frt_get_locale(VALUE self)
1671
+ {
1672
+ return (frt_locale ? rb_str_new2(frt_locale) : Qnil);
1673
+ }
1674
+
1675
+ /*
1676
+ * call-seq:
1677
+ * Ferret.locale = "en_US.UTF-8"
1678
+ *
1679
+ * Set the global locale. You should use this method to set different locales
1680
+ * when indexing documents with different encodings.
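+ *
+ * For example (assuming the "en_US.UTF-8" locale is available on your
+ * system);
+ *
+ *   Ferret.locale = "en_US.UTF-8"
+ *   puts Ferret.locale #=> "en_US.UTF-8"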
1681
+ */
1682
+ static VALUE frt_set_locale(VALUE self, VALUE locale)
1683
+ {
1684
+ char *l = ((locale == Qnil) ? NULL : rs2s(rb_obj_as_string(locale)));
1685
+ frt_locale = setlocale(LC_CTYPE, l);
1686
+ return frt_locale ? rb_str_new2(frt_locale) : Qnil;
1687
+ }
1688
+
1689
+ /****************************************************************************
1690
+ *
1691
+ * Init Functions
1692
+ *
1693
+ ****************************************************************************/
1694
+
1695
+ /*
1696
+ * Document-class: Ferret::Analysis::Token
1697
+ *
1698
+ * == Summary
1699
+ *
1700
+ * A Token is an occurrence of a term from the text of a field. It consists
1701
+ * of a term's text and the start and end offset of the term in the text of
1702
+ * the field;
1703
+ *
1704
+ * The start and end offsets permit applications to re-associate a token with
1705
+ * its source text, e.g., to display highlighted query terms in a document
1706
+ * browser, or to show matching text fragments in a KWIC (KeyWord In Context)
1707
+ * display, etc.
1708
+ *
1709
+ * === Attributes
1710
+ *
1711
+ * text:: the term's text, which may have been modified by a TokenFilter or
1712
+ * Tokenizer from the text originally found in the document
1713
+ * start:: is the position of the first character corresponding to
1714
+ * this token in the source text
1715
+ * end:: is equal to one greater than the position of the last
1716
+ * character corresponding to this token. Note that the
1717
+ * difference between @end_offset and @start_offset may not be
1718
+ * equal to @text.length(), as the term text may have been
1719
+ * altered by a stemmer or some other filter.
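+ *
+ * === Example
+ *
+ * A minimal sketch of how these attributes are typically read while
+ * stepping through a TokenStream;
+ *
+ *   ts = StandardTokenizer.new("Dave's résumé")
+ *   while token = ts.next
+ *     puts "#{token.text} [#{token.start}..#{token.end}]"
+ *   end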
1720
+ */
1721
+ static void Init_Token(void)
1722
+ {
1723
+ cToken = rb_define_class_under(mAnalysis, "Token", rb_cObject);
1724
+ rb_define_alloc_func(cToken, frt_token_alloc);
1725
+ rb_include_module(cToken, rb_mComparable);
1726
+
1727
+ rb_define_method(cToken, "initialize", frt_token_init, -1);
1728
+ rb_define_method(cToken, "<=>", frt_token_cmp, 1);
1729
+ rb_define_method(cToken, "text", frt_token_get_text, 0);
1730
+ rb_define_method(cToken, "text=", frt_token_set_text, 1);
1731
+ rb_define_method(cToken, "start", frt_token_get_start_offset, 0);
1732
+ rb_define_method(cToken, "start=", frt_token_set_start_offset, 1);
1733
+ rb_define_method(cToken, "end", frt_token_get_end_offset, 0);
1734
+ rb_define_method(cToken, "end=", frt_token_set_end_offset, 1);
1735
+ rb_define_method(cToken, "pos_inc", frt_token_get_pos_inc, 0);
1736
+ rb_define_method(cToken, "pos_inc=", frt_token_set_pos_inc, 1);
1737
+ rb_define_method(cToken, "to_s", frt_token_to_s, 0);
1738
+ }
1739
+
1740
+ /*
1741
+ * Document-class: Ferret::Analysis::TokenStream
1742
+ *
1743
+ * A TokenStream enumerates the sequence of tokens, either from
1744
+ * fields of a document or from query text.
1745
+ *
1746
+ * This is an abstract class. Concrete subclasses are:
1747
+ *
1748
+ * Tokenizer:: a TokenStream whose input is a string
1749
+ * TokenFilter:: a TokenStream whose input is another TokenStream
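+ *
+ * A typical chain wraps a Tokenizer in one or more TokenFilters, for
+ * example (a sketch only, +str+ being the text to tokenize);
+ *
+ *   ts = StopFilter.new(LowerCaseFilter.new(StandardTokenizer.new(str)))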
1750
+ */
1751
+ static void Init_TokenStream(void)
1752
+ {
1753
+ cTokenStream = rb_define_class_under(mAnalysis, "TokenStream", rb_cObject);
1754
+ frt_mark_cclass(cTokenStream);
1755
+ rb_define_method(cTokenStream, "next", frt_ts_next, 0);
1756
+ rb_define_method(cTokenStream, "text=", frt_ts_set_text, 1);
1757
+ rb_define_method(cTokenStream, "text", frt_ts_get_text, 0);
1758
+ }
1759
+
1760
+ /*
1761
+ * Document-class: Ferret::Analysis::AsciiLetterTokenizer
1762
+ *
1763
+ * An AsciiLetterTokenizer is a tokenizer that divides text at non-letters,
1764
+ * i.e. anything that is not an ASCII letter. It defines tokens as maximal
1765
+ * strings of adjacent letters, as defined by the regular expression _/[A-Za-z]+/_.
1766
+ *
1767
+ * === Example
1768
+ *
1769
+ * "Dave's résumé, at http://www.davebalmain.com/ 1234"
1770
+ * => ["Dave", "s", "r", "sum", "at", "http", "www", "davebalmain", "com"]
1771
+ */
1772
+ static void Init_AsciiLetterTokenizer(void)
1773
+ {
1774
+ cAsciiLetterTokenizer =
1775
+ rb_define_class_under(mAnalysis, "AsciiLetterTokenizer", cTokenStream);
1776
+ frt_mark_cclass(cAsciiLetterTokenizer);
1777
+ rb_define_alloc_func(cAsciiLetterTokenizer, frt_data_alloc);
1778
+ rb_define_method(cAsciiLetterTokenizer, "initialize",
1779
+ frt_a_letter_tokenizer_init, 1);
1780
+ }
1781
+
1782
+ /*
1783
+ * Document-class: Ferret::Analysis::LetterTokenizer
1784
+ *
1785
+ * A LetterTokenizer is a tokenizer that divides text at non-letters. That is
1786
+ * to say, it defines tokens as maximal strings of adjacent letters, as
1787
+ * defined by the regular expression _/[[:alpha:]]+/_ where [[:alpha:]] matches
1788
+ * all alphabetic characters in your current locale.
1789
+ *
1790
+ * === Example
1791
+ *
1792
+ * "Dave's résumé, at http://www.davebalmain.com/ 1234"
1793
+ * => ["Dave", "s", "résumé", "at", "http", "www", "davebalmain", "com"]
1794
+ */
1795
+ static void Init_LetterTokenizer(void)
1796
+ {
1797
+ cLetterTokenizer =
1798
+ rb_define_class_under(mAnalysis, "LetterTokenizer", cTokenStream);
1799
+ frt_mark_cclass(cLetterTokenizer);
1800
+ rb_define_alloc_func(cLetterTokenizer, frt_data_alloc);
1801
+ rb_define_method(cLetterTokenizer, "initialize",
1802
+ frt_letter_tokenizer_init, -1);
1803
+ }
1804
+
1805
+ /*
1806
+ * Document-class: Ferret::Analysis::AsciiWhiteSpaceTokenizer
1807
+ *
1808
+ * An AsciiWhiteSpaceTokenizer is a tokenizer that divides text at white-space.
1809
+ * Adjacent sequences of non-WhiteSpace characters form tokens.
1810
+ *
1811
+ * === Example
1812
+ *
1813
+ * "Dave's résumé, at http://www.davebalmain.com/ 1234"
1814
+ * => ["Dave's", "résumé,", "at", "http://www.davebalmain.com", "1234"]
1815
+ */
1816
+ static void Init_AsciiWhiteSpaceTokenizer(void)
1817
+ {
1818
+ cAsciiWhiteSpaceTokenizer =
1819
+ rb_define_class_under(mAnalysis, "AsciiWhiteSpaceTokenizer",
1820
+ cTokenStream);
1821
+ frt_mark_cclass(cAsciiWhiteSpaceTokenizer);
1822
+ rb_define_alloc_func(cAsciiWhiteSpaceTokenizer, frt_data_alloc);
1823
+ rb_define_method(cAsciiWhiteSpaceTokenizer, "initialize",
1824
+ frt_a_whitespace_tokenizer_init, 1);
1825
+ }
1826
+
1827
+ /*
1828
+ * Document-class: Ferret::Analysis::WhiteSpaceTokenizer
1829
+ *
1830
+ * A WhiteSpaceTokenizer is a tokenizer that divides text at white-space.
1831
+ * Adjacent sequences of non-WhiteSpace characters form tokens.
1832
+ *
1833
+ * === Example
1834
+ *
1835
+ * "Dave's résumé, at http://www.davebalmain.com/ 1234"
1836
+ * => ["Dave's", "résumé,", "at", "http://www.davebalmain.com", "1234"]
1837
+ */
1838
+ static void Init_WhiteSpaceTokenizer(void)
1839
+ {
1840
+ cWhiteSpaceTokenizer =
1841
+ rb_define_class_under(mAnalysis, "WhiteSpaceTokenizer", cTokenStream);
1842
+ frt_mark_cclass(cWhiteSpaceTokenizer);
1843
+ rb_define_alloc_func(cWhiteSpaceTokenizer, frt_data_alloc);
1844
+ rb_define_method(cWhiteSpaceTokenizer, "initialize",
1845
+ frt_whitespace_tokenizer_init, -1);
1846
+ }
1847
+
1848
+ /*
1849
+ * Document-class: Ferret::Analysis::AsciiStandardTokenizer
1850
+ *
1851
+ * The standard tokenizer is an advanced tokenizer which tokenizes most
1852
+ * words correctly as well as tokenizing things like email addresses, web
1853
+ * addresses, phone numbers, etc.
1854
+ *
1855
+ * === Example
1856
+ *
1857
+ * "Dave's résumé, at http://www.davebalmain.com/ 1234"
1858
+ * => ["Dave's", "r", "sum", "at", "http://www.davebalmain.com", "1234"]
1859
+ */
1860
+ static void Init_AsciiStandardTokenizer(void)
1861
+ {
1862
+ cAsciiStandardTokenizer =
1863
+ rb_define_class_under(mAnalysis, "AsciiStandardTokenizer", cTokenStream);
1864
+ frt_mark_cclass(cAsciiStandardTokenizer);
1865
+ rb_define_alloc_func(cAsciiStandardTokenizer, frt_data_alloc);
1866
+ rb_define_method(cAsciiStandardTokenizer, "initialize",
1867
+ frt_a_standard_tokenizer_init, 1);
1868
+ }
1869
+
1870
+ /*
1871
+ * Document-class: Ferret::Analysis::StandardTokenizer
1872
+ *
1873
+ * The standard tokenizer is an advanced tokenizer which tokenizes most
1874
+ * words correctly as well as tokenizing things like email addresses, web
1875
+ * addresses, phone numbers, etc.
1876
+ *
1877
+ * === Example
1878
+ *
1879
+ * "Dave's résumé, at http://www.davebalmain.com/ 1234"
1880
+ * => ["Dave's", "résumé", "at", "http://www.davebalmain.com", "1234"]
1881
+ */
1882
+ static void Init_StandardTokenizer(void)
1883
+ {
1884
+ cStandardTokenizer =
1885
+ rb_define_class_under(mAnalysis, "StandardTokenizer", cTokenStream);
1886
+ frt_mark_cclass(cStandardTokenizer);
1887
+ rb_define_alloc_func(cStandardTokenizer, frt_data_alloc);
1888
+ rb_define_method(cStandardTokenizer, "initialize",
1889
+ frt_standard_tokenizer_init, 1);
1890
+ }
1891
+
1892
+ /*
1893
+ * Document-class: Ferret::Analysis::RegExpTokenizer
1894
+ *
1895
+ * A tokenizer that recognizes tokens based on a regular expression passed to
1896
+ * the constructor. Most possible tokenizers can be created using this class.
1897
+ *
1898
+ * === Example
1899
+ *
1900
+ * Below is an example of a simple implementation of a LetterTokenizer using
1901
+ * a RegExpTokenizer. Basically, a token is a sequence of alphabetic
1902
+ * characters separated by one or more non-alphabetic characters.
1903
+ *
1904
+ * # of course you would add more than just é
1905
+ * RegExpTokenizer.new(input, /[[:alpha:]é]+/)
1906
+ *
1907
+ * "Dave's résumé, at http://www.davebalmain.com/ 1234"
1908
+ * => ["Dave", "s", "résumé", "at", "http", "www", "davebalmain", "com"]
1909
+ */
1910
+ static void Init_RegExpTokenizer(void)
1911
+ {
1912
+ cRegExpTokenizer =
1913
+ rb_define_class_under(mAnalysis, "RegExpTokenizer", cTokenStream);
1914
+ frt_mark_cclass(cRegExpTokenizer);
1915
+ rtoken_re = rb_reg_new(TOKEN_RE, strlen(TOKEN_RE), 0);
1916
+ rb_define_const(cRegExpTokenizer, "REGEXP", rtoken_re);
1917
+ rb_define_alloc_func(cRegExpTokenizer, frt_data_alloc);
1918
+ rb_define_method(cRegExpTokenizer, "initialize",
1919
+ frt_rets_init, -1);
1920
+ rb_define_method(cRegExpTokenizer, "text=", frt_rets_set_text, 1);
1921
+ rb_define_method(cRegExpTokenizer, "text", frt_rets_get_text, 0);
1922
+ }
1923
+
1924
+ /***************/
1925
+ /*** Filters ***/
1926
+ /***************/
1927
+
1928
+ /*
1929
+ * Document-class: Ferret::Analysis::AsciiLowerCaseFilter
1930
+ *
1931
+ * AsciiLowerCaseFilter normalizes a token's text to lowercase but only for
1932
+ * ASCII characters. For other characters use LowerCaseFilter.
1933
+ *
1934
+ * === Example
1935
+ *
1936
+ * ["One", "TWO", "three", "RÉSUMÉ"] => ["one", "two", "three", "rÉsumÉ"]
1937
+ *
1938
+ */
1939
+ static void Init_AsciiLowerCaseFilter(void)
1940
+ {
1941
+ cAsciiLowerCaseFilter =
1942
+ rb_define_class_under(mAnalysis, "AsciiLowerCaseFilter", cTokenStream);
1943
+ frt_mark_cclass(cAsciiLowerCaseFilter);
1944
+ rb_define_alloc_func(cAsciiLowerCaseFilter, frt_data_alloc);
1945
+ rb_define_method(cAsciiLowerCaseFilter, "initialize",
1946
+ frt_a_lowercase_filter_init, 1);
1947
+ }
1948
+
1949
+ /*
1950
+ * Document-class: Ferret::Analysis::LowerCaseFilter
1951
+ *
1952
+ * LowerCaseFilter normalizes a token's text to lowercase based on the
1953
+ * current locale.
1954
+ *
1955
+ * === Example
1956
+ *
1957
+ * ["One", "TWO", "three", "RÉSUMÉ"] => ["one", "two", "three", "résumé"]
1958
+ *
1959
+ */
1960
+ static void Init_LowerCaseFilter(void)
1961
+ {
1962
+ cLowerCaseFilter =
1963
+ rb_define_class_under(mAnalysis, "LowerCaseFilter", cTokenStream);
1964
+ frt_mark_cclass(cLowerCaseFilter);
1965
+ rb_define_alloc_func(cLowerCaseFilter, frt_data_alloc);
1966
+ rb_define_method(cLowerCaseFilter, "initialize",
1967
+ frt_lowercase_filter_init, 1);
1968
+ }
1969
+
1970
+ /*
1971
+ * Document-class: Ferret::Analysis::HyphenFilter
1972
+ *
1973
+ * HyphenFilter handles hyphenated words by adding both the word concatenated
1974
+ * into a single word and the word split into its parts, i.e. "e-mail" becomes
1975
+ * "email" and "e mail". This way a search for "e-mail", "email" and "mail"
1976
+ * will all match. This filter is used by default by the StandardAnalyzer.
1977
+ *
1978
+ * === Example
1979
+ *
1980
+ * ["e-mail", "set-up"] => ["email", "e", "mail", "setup", "set", "up"]
1981
+ *
1982
+ */
1983
+ static void Init_HyphenFilter(void)
1984
+ {
1985
+ cHyphenFilter =
1986
+ rb_define_class_under(mAnalysis, "HyphenFilter", cTokenStream);
1987
+ frt_mark_cclass(cHyphenFilter);
1988
+ rb_define_alloc_func(cHyphenFilter, frt_data_alloc);
1989
+ rb_define_method(cHyphenFilter, "initialize", frt_hyphen_filter_init, 1);
1990
+ }
1991
+
1992
+ /*
1993
+ * Document-class: Ferret::Analysis::MappingFilter
1994
+ *
1995
+ * A MappingFilter maps strings in tokens. This is usually used to map UTF-8
1996
+ * characters to ASCII characters for easier searching and better search
1997
+ * recall. The mapping is compiled into a Deterministic Finite Automaton so it
1998
+ * is very fast. This filter can therefore be used for indexing very large
1999
+ * datasets. Currently regular expressions are not supported. If you are
2000
+ * really interested in the feature, please contact me at dbalmain@gmail.com.
2001
+ *
2002
+ * == Example
2003
+ *
2004
+ * mapping = {
2005
+ * ['à','á','â','ã','ä','å','ā','ă'] => 'a',
2006
+ * 'æ' => 'ae',
2007
+ * ['ď','đ'] => 'd',
2008
+ * ['ç','ć','č','ĉ','ċ'] => 'c',
2009
+ * ['è','é','ê','ë','ē','ę','ě','ĕ','ė',] => 'e',
2010
+ * ['ƒ'] => 'f',
2011
+ * ['ĝ','ğ','ġ','ģ'] => 'g',
2012
+ * ['ĥ','ħ'] => 'h',
2013
+ * ['ì','ì','í','î','ï','ī','ĩ','ĭ'] => 'i',
2014
+ * ['į','ı','ij','ĵ'] => 'j',
2015
+ * ['ķ','ĸ'] => 'k',
2016
+ * ['ł','ľ','ĺ','ļ','ŀ'] => 'l',
2017
+ * ['ñ','ń','ň','ņ','ʼn','ŋ'] => 'n',
2018
+ * ['ò','ó','ô','õ','ö','ø','ō','ő','ŏ','ŏ'] => 'o',
2019
+ * ['œ'] => 'oek',
2020
+ * ['ą'] => 'q',
2021
+ * ['ŕ','ř','ŗ'] => 'r',
2022
+ * ['ś','š','ş','ŝ','ș'] => 's',
2023
+ * ['ť','ţ','ŧ','ț'] => 't',
2024
+ * ['ù','ú','û','ü','ū','ů','ű','ŭ','ũ','ų'] => 'u',
2025
+ * ['ŵ'] => 'w',
2026
+ * ['ý','ÿ','ŷ'] => 'y',
2027
+ * ['ž','ż','ź'] => 'z'
2028
+ * }
2029
+ * filt = MappingFilter.new(token_stream, mapping)
2030
+ */
2031
+ static void Init_MappingFilter(void)
2032
+ {
2033
+ cMappingFilter =
2034
+ rb_define_class_under(mAnalysis, "MappingFilter", cTokenStream);
2035
+ frt_mark_cclass(cMappingFilter);
2036
+ rb_define_alloc_func(cMappingFilter, frt_data_alloc);
2037
+ rb_define_method(cMappingFilter, "initialize",
2038
+ frt_mapping_filter_init, 2);
2039
+ }
2040
+
2041
+ /*
2042
+ * Document-class: Ferret::Analysis::StopFilter
2043
+ *
2044
+ * A StopFilter filters *stop-words* from a TokenStream. Stop-words are words
2045
+ * that you don't wish to be indexed. Usually they will be common words like
2046
+ * "the" and "and" although you can specify whichever words you want.
2047
+ *
2048
+ * === Example
2049
+ *
2050
+ * ["the", "pig", "and", "whistle"] => ["pig", "whistle"]
2051
+ */
2052
+ static void Init_StopFilter(void)
2053
+ {
2054
+ cStopFilter =
2055
+ rb_define_class_under(mAnalysis, "StopFilter", cTokenStream);
2056
+ frt_mark_cclass(cStopFilter);
2057
+ rb_define_alloc_func(cStopFilter, frt_data_alloc);
2058
+ rb_define_method(cStopFilter, "initialize",
2059
+ frt_stop_filter_init, -1);
2060
+ }
2061
+
2062
+ /*
2063
+ * Document-class: Ferret::Analysis::StemFilter
2064
+ *
2065
+ * == Summary
2066
+ *
2067
+ * A StemFilter takes a term and transforms it as per the Snowball
2068
+ * stemming algorithm. Note: the input to the stemming filter must already
2069
+ * be in lower case, so you will need to use LowerCaseFilter or a lowercasing
2070
+ * Tokenizer further down the Tokenizer chain in order for this to work
2071
+ * properly!
2072
+ *
2073
+ * === Available algorithms and encodings
2074
+ *
2075
+ * Algorithm Algorithm Pseudonyms Encoding
2076
+ * ----------------------------------------------------------------
2077
+ * "danish", | "da", "dan" | "ISO_8859_1", "UTF_8"
2078
+ * "dutch", | "dut", "nld" | "ISO_8859_1", "UTF_8"
2079
+ * "english", | "en", "eng" | "ISO_8859_1", "UTF_8"
2080
+ * "finnish", | "fi", "fin" | "ISO_8859_1", "UTF_8"
2081
+ * "french", | "fr", "fra", "fre" | "ISO_8859_1", "UTF_8"
2082
+ * "german", | "de", "deu", "ge", "ger" | "ISO_8859_1", "UTF_8"
2083
+ * "italian", | "it", "ita" | "ISO_8859_1", "UTF_8"
2084
+ * "norwegian", | "nl", "no" | "ISO_8859_1", "UTF_8"
2085
+ * "porter", | | "ISO_8859_1", "UTF_8"
2086
+ * "portuguese", | "por", "pt" | "ISO_8859_1", "UTF_8"
2087
+ * "russian", | "ru", "rus" | "KOI8_R", "UTF_8"
2088
+ * "spanish", | "es", "esl" | "ISO_8859_1", "UTF_8"
2089
+ * "swedish", | "sv", "swe" | "ISO_8859_1", "UTF_8"
2090
+ *
2091
+ * === Example
2092
+ *
2093
+ * To use this filter with other analyzers, you'll want to write an Analyzer
2094
+ * class that sets up the TokenStream chain as you want it. To use this with
2095
+ * a lowercasing Tokenizer, for example, you'd write an analyzer like this:
2096
+ *
2097
+ * class MyAnalyzer < Analyzer
2098
+ * def token_stream(field, str)
2099
+ * return StemFilter.new(LowerCaseFilter.new(StandardTokenizer.new(str)))
2100
+ * end
2101
+ * end
2102
+ *
2103
+ * "debate debates debated debating debater"
2104
+ * => ["debat", "debat", "debat", "debat", "debat"]
2105
+ *
2106
+ * === Attributes
2107
+ *
2108
+ * token_stream:: TokenStream to be filtered
2109
+ * algorithm:: The algorithm (or language) to use (default: "english")
2110
+ * encoding:: The encoding of the data (default: "UTF-8")
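+ *
+ * So a Spanish stemming filter, for example, could be sketched like this;
+ *
+ *   ts = StemFilter.new(LowerCaseFilter.new(StandardTokenizer.new(str)),
+ *                       "spanish")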
2111
+ */
2112
+ static void Init_StemFilter(void)
2113
+ {
2114
+ cStemFilter =
2115
+ rb_define_class_under(mAnalysis, "StemFilter", cTokenStream);
2116
+ frt_mark_cclass(cStemFilter);
2117
+ rb_define_alloc_func(cStemFilter, frt_data_alloc);
2118
+ rb_define_method(cStemFilter, "initialize",
2119
+ frt_stem_filter_init, -1);
2120
+ }
2121
+
2122
+ /*************************/
2123
+ /*** * * Analyzers * * ***/
2124
+ /*************************/
2125
+
2126
+ /*
2127
+ * Document-class: Ferret::Analysis::Analyzer
2128
+ *
2129
+ * == Summary
2130
+ *
2131
+ * An Analyzer builds TokenStreams, which analyze text. It thus represents
2132
+ * a policy for extracting index terms from text.
2133
+ *
2134
+ * Typical implementations first build a Tokenizer, which breaks the stream
2135
+ * of characters from the Reader into raw Tokens. One or more TokenFilters
2136
+ * may then be applied to the output of the Tokenizer.
2137
+ *
2138
+ * The default Analyzer just creates a LowerCaseTokenizer which converts
2139
+ * all text to lowercase tokens. See LowerCaseTokenizer for more details.
2140
+ *
2141
+ * === Example
2142
+ *
2143
+ * To create your own custom Analyzer you simply need to implement a
2144
+ * token_stream method which takes the field name and the data to be
2145
+ * tokenized as parameters and returns a TokenStream. Most analyzers
2146
+ * typically ignore the field name.
2147
+ *
2148
+ * Here we'll create a StemmingAnalyzer;
2149
+ *
2150
+ * class MyAnalyzer < Analyzer
2151
+ * def token_stream(field, str)
2152
+ * return StemFilter.new(LowerCaseFilter.new(StandardTokenizer.new(str)))
2153
+ * end
2154
+ * end
2155
+ */
2156
+ static void Init_Analyzer(void)
2157
+ {
2158
+ cAnalyzer =
2159
+ rb_define_class_under(mAnalysis, "Analyzer", rb_cObject);
2160
+ frt_mark_cclass(cAnalyzer);
2161
+ rb_define_alloc_func(cAnalyzer, frt_data_alloc);
2162
+ rb_define_method(cAnalyzer, "initialize", frt_letter_analyzer_init, -1);
2163
+ rb_define_method(cAnalyzer, "token_stream", frt_analyzer_token_stream, 2);
2164
+ }
2165
+
2166
+ /*
2167
+ * Document-class: Ferret::Analysis::AsciiLetterAnalyzer
2168
+ *
2169
+ * == Summary
2170
+ *
2171
+ * An AsciiLetterAnalyzer creates a TokenStream that splits the input up into
2172
+ * maximal strings of ASCII letters. If implemented in Ruby it would look
2173
+ * like;
2174
+ *
2175
+ * class AsciiLetterAnalyzer
2176
+ * def initialize(lower = true)
2177
+ * @lower = lower
2178
+ * end
2179
+ *
2180
+ * def token_stream(field, str)
2181
+ * if @lower
2182
+ * return AsciiLowerCaseFilter.new(AsciiLetterTokenizer.new(str))
2183
+ * else
2184
+ * return AsciiLetterTokenizer.new(str)
2185
+ * end
2186
+ * end
2187
+ * end
2188
+ *
2189
+ * As you can see it makes use of the AsciiLetterTokenizer and
2190
+ * AsciiLowerCaseFilter. Note that this tokenizer won't recognize non-ASCII
2191
+ * characters so you should use the LetterAnalyzer if you want to analyze
2192
+ * multi-byte data like "UTF-8".
2193
+ */
2194
+ static void Init_AsciiLetterAnalyzer(void)
2195
+ {
2196
+ cAsciiLetterAnalyzer =
2197
+ rb_define_class_under(mAnalysis, "AsciiLetterAnalyzer", cAnalyzer);
2198
+ frt_mark_cclass(cAsciiLetterAnalyzer);
2199
+ rb_define_alloc_func(cAsciiLetterAnalyzer, frt_data_alloc);
2200
+ rb_define_method(cAsciiLetterAnalyzer, "initialize",
2201
+ frt_a_letter_analyzer_init, -1);
2202
+ }
2203
+
2204
+ /*
2205
+ * Document-class: Ferret::Analysis::LetterAnalyzer
2206
+ *
2207
+ * == Summary
2208
+ *
2209
+ * A LetterAnalyzer creates a TokenStream that splits the input up into
2210
+ * maximal strings of alphabetic characters, as recognized by the current locale. If
2211
+ * implemented in Ruby it would look like;
2212
+ *
2213
+ * class LetterAnalyzer
2214
+ * def initialize(lower = true)
2215
+ * @lower = lower
2216
+ * end
2217
+ *
2218
+ * def token_stream(field, str)
2219
+ * return LetterTokenizer.new(str, @lower)
2220
+ * end
2221
+ * end
2222
+ *
2223
+ * As you can see it makes use of the LetterTokenizer.
2224
+ */
2225
+ static void Init_LetterAnalyzer(void)
2226
+ {
2227
+ cLetterAnalyzer =
2228
+ rb_define_class_under(mAnalysis, "LetterAnalyzer", cAnalyzer);
2229
+ frt_mark_cclass(cLetterAnalyzer);
2230
+ rb_define_alloc_func(cLetterAnalyzer, frt_data_alloc);
2231
+ rb_define_method(cLetterAnalyzer, "initialize",
2232
+ frt_letter_analyzer_init, -1);
2233
+ }
2234
+
2235
+ /*
2236
+ * Document-class: Ferret::Analysis::AsciiWhiteSpaceAnalyzer
2237
+ *
2238
+ * == Summary
2239
+ *
2240
+ * The AsciiWhiteSpaceAnalyzer recognizes tokens as maximal strings of
2241
+ * non-whitespace characters. If implemented in Ruby the
2242
+ * AsciiWhiteSpaceAnalyzer would look like;
2243
+ *
2244
+ * class AsciiWhiteSpaceAnalyzer
2245
+ * def initialize(lower = true)
2246
+ * @lower = lower
2247
+ * end
2248
+ *
2249
+ * def token_stream(field, str)
2250
+ * if @lower
2251
+ * return AsciiLowerCaseFilter.new(AsciiWhiteSpaceTokenizer.new(str))
2252
+ * else
2253
+ * return AsciiWhiteSpaceTokenizer.new(str)
2254
+ * end
2255
+ * end
2256
+ * end
2257
+ *
2258
+ * As you can see it makes use of the AsciiWhiteSpaceTokenizer. You should
2259
+ * use WhiteSpaceAnalyzer if you want to recognize multibyte encodings such
2260
+ * as "UTF-8".
2261
+ */
2262
+ static void Init_AsciiWhiteSpaceAnalyzer(void)
2263
+ {
2264
+ cAsciiWhiteSpaceAnalyzer =
2265
+ rb_define_class_under(mAnalysis, "AsciiWhiteSpaceAnalyzer", cAnalyzer);
2266
+ frt_mark_cclass(cAsciiWhiteSpaceAnalyzer);
2267
+ rb_define_alloc_func(cAsciiWhiteSpaceAnalyzer, frt_data_alloc);
2268
+ rb_define_method(cAsciiWhiteSpaceAnalyzer, "initialize",
2269
+ frt_a_white_space_analyzer_init, -1);
2270
+ }
2271
+
2272
+ /*
2273
+ * Document-class: Ferret::Analysis::WhiteSpaceAnalyzer
2274
+ *
2275
+ * == Summary
2276
+ *
2277
+ * The WhiteSpaceAnalyzer recognizes tokens as maximal strings of
2278
+ * non-whitespace characters. If implemented in Ruby the WhiteSpaceAnalyzer
2279
+ * would look like;
2280
+ *
2281
+ * class WhiteSpaceAnalyzer
2282
+ * def initialize(lower = true)
2283
+ * @lower = lower
2284
+ * end
2285
+ *
2286
+ * def token_stream(field, str)
2287
+ * return WhiteSpaceTokenizer.new(str, @lower)
2288
+ * end
2289
+ * end
2290
+ *
2291
+ * As you can see it makes use of the WhiteSpaceTokenizer.
2292
+ */
2293
+ static void Init_WhiteSpaceAnalyzer(void)
2294
+ {
2295
+ cWhiteSpaceAnalyzer =
2296
+ rb_define_class_under(mAnalysis, "WhiteSpaceAnalyzer", cAnalyzer);
2297
+ frt_mark_cclass(cWhiteSpaceAnalyzer);
2298
+ rb_define_alloc_func(cWhiteSpaceAnalyzer, frt_data_alloc);
2299
+ rb_define_method(cWhiteSpaceAnalyzer, "initialize",
2300
+ frt_white_space_analyzer_init, -1);
2301
+ }
2302
+
2303
+ /*
2304
+ * Document-class: Ferret::Analysis::AsciiStandardAnalyzer
2305
+ *
2306
+ * == Summary
2307
+ *
2308
+ * The AsciiStandardAnalyzer is the most advanced of the available
2309
+ * ASCII-analyzers. If it were implemented in Ruby it would look like this;
2310
+ *
2311
+ * class AsciiStandardAnalyzer
2312
+ * def initialize(stop_words = FULL_ENGLISH_STOP_WORDS, lower = true)
2313
+ * @lower = lower
2314
+ * @stop_words = stop_words
2315
+ * end
2316
+ *
2317
+ * def token_stream(field, str)
2318
+ * ts = AsciiStandardTokenizer.new(str)
2319
+ * ts = AsciiLowerCaseFilter.new(ts) if @lower
2320
+ * ts = StopFilter.new(ts, @stop_words)
2321
+ * ts = HyphenFilter.new(ts)
2322
+ * end
2323
+ * end
2324
+ *
2325
+ * As you can see it makes use of the AsciiStandardTokenizer and you can also
2326
+ * add your own list of stop-words if you wish. Note that this tokenizer
2327
+ * won't recognize non-ASCII characters so you should use the
2328
+ * StandardAnalyzer if you want to analyze multi-byte data like "UTF-8".
2329
+ */
2330
+ static void Init_AsciiStandardAnalyzer(void)
2331
+ {
2332
+ cAsciiStandardAnalyzer =
2333
+ rb_define_class_under(mAnalysis, "AsciiStandardAnalyzer", cAnalyzer);
2334
+ frt_mark_cclass(cAsciiStandardAnalyzer);
2335
+ rb_define_alloc_func(cAsciiStandardAnalyzer, frt_data_alloc);
2336
+ rb_define_method(cAsciiStandardAnalyzer, "initialize",
2337
+ frt_a_standard_analyzer_init, -1);
2338
+ }
2339
+
2340
+ /*
2341
+ * Document-class: Ferret::Analysis::StandardAnalyzer
2342
+ *
2343
+ * == Summary
2344
+ *
2345
+ * The StandardAnalyzer is the most advanced of the available analyzers. If
2346
+ * it were implemented in Ruby it would look like this;
2347
+ *
2348
+ * class StandardAnalyzer
2349
+ * def initialize(stop_words = FULL_ENGLISH_STOP_WORDS, lower = true)
2350
+ * @lower = lower
2351
+ * @stop_words = stop_words
2352
+ * end
2353
+ *
2354
+ * def token_stream(field, str)
2355
+ * ts = StandardTokenizer.new(str)
2356
+ * ts = LowerCaseFilter.new(ts) if @lower
2357
+ * ts = StopFilter.new(ts, @stop_words)
2358
+ * ts = HyphenFilter.new(ts)
2359
+ * end
2360
+ * end
2361
+ *
2362
+ * As you can see it makes use of the StandardTokenizer and you can also add
2363
+ * your own list of stopwords if you wish.
2364
+ */
2365
+ static void Init_StandardAnalyzer(void)
2366
+ {
2367
+ cStandardAnalyzer =
2368
+ rb_define_class_under(mAnalysis, "StandardAnalyzer", cAnalyzer);
2369
+ frt_mark_cclass(cStandardAnalyzer);
2370
+ rb_define_alloc_func(cStandardAnalyzer, frt_data_alloc);
2371
+ rb_define_method(cStandardAnalyzer, "initialize",
2372
+ frt_standard_analyzer_init, -1);
2373
+ }
2374
+
2375
+ /*
2376
+ * Document-class: Ferret::Analysis::PerFieldAnalyzer
2377
+ *
2378
+ * == Summary
2379
+ *
2380
+ * The PerFieldAnalyzer is for use when you want to analyze different fields
2381
+ * with different analyzers. With the PerFieldAnalyzer you can specify how
2382
+ * you want each field analyzed.
2383
+ *
2384
+ * === Example
2385
+ *
2386
+ * # Create a new PerFieldAnalyzer which uses StandardAnalyzer by default
2387
+ * pfa = PerFieldAnalyzer.new(StandardAnalyzer.new())
2388
+ *
2389
+ * # Use the WhiteSpaceAnalyzer with no lowercasing on the :title field
2390
+ * pfa[:title] = WhiteSpaceAnalyzer.new(false)
2391
+ *
2392
+ * # Use a custom analyzer on the :created_at field
2393
+ * pfa[:created_at] = DateAnalyzer.new
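+ *
+ * # The :title field now gets the WhiteSpaceAnalyzer while any other field
+ * # falls back to the default StandardAnalyzer (a sketch of the behaviour)
+ * ts = pfa.token_stream(:title, "A Title To Tokenize")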
2394
+ */
2395
+ static void Init_PerFieldAnalyzer(void)
2396
+ {
2397
+ cPerFieldAnalyzer =
2398
+ rb_define_class_under(mAnalysis, "PerFieldAnalyzer", cAnalyzer);
2399
+ frt_mark_cclass(cPerFieldAnalyzer);
2400
+ rb_define_alloc_func(cPerFieldAnalyzer, frt_data_alloc);
2401
+ rb_define_method(cPerFieldAnalyzer, "initialize",
2402
+ frt_per_field_analyzer_init, 1);
2403
+ rb_define_method(cPerFieldAnalyzer, "add_field",
2404
+ frt_per_field_analyzer_add_field, 2);
2405
+ rb_define_method(cPerFieldAnalyzer, "[]=",
2406
+ frt_per_field_analyzer_add_field, 2);
2407
+ rb_define_method(cPerFieldAnalyzer, "token_stream",
2408
+ frt_pfa_analyzer_token_stream, 2);
2409
+ }
2410
+
2411
+ /*
2412
+ * Document-class: Ferret::Analysis::RegExpAnalyzer
2413
+ *
2414
+ * == Summary
2415
+ *
2416
+ * Using a RegExpAnalyzer is a simple way to create a custom analyzer. If
2417
+ * implemented in Ruby it would look like this;
2418
+ *
2419
+ * class RegExpAnalyzer
2420
+ * def initialize(reg_exp, lower = true)
2421
+ * @lower = lower
2422
+ * @reg_exp = reg_exp
2423
+ * end
2424
+ *
2425
+ * def token_stream(field, str)
2426
+ * if @lower
2427
+ * return LowerCaseFilter.new(RegExpTokenizer.new(str, @reg_exp))
2428
+ * else
2429
+ * return RegExpTokenizer.new(str, @reg_exp)
2430
+ * end
2431
+ * end
2432
+ * end
2433
+ *
2434
+ * === Example
2435
+ *
2436
+ * csv_analyzer = RegExpAnalyzer.new(/[^,]+/, false)
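+ *
+ * # and then, roughly speaking;
+ * ts = csv_analyzer.token_stream(:field, "one,TWO,three")
+ * # => tokens "one", "TWO" and "three" (no lowercasing since lower is false)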
2437
+ */
2438
+ static void Init_RegExpAnalyzer(void)
2439
+ {
2440
+ cRegExpAnalyzer =
2441
+ rb_define_class_under(mAnalysis, "RegExpAnalyzer", cAnalyzer);
2442
+ frt_mark_cclass(cRegExpAnalyzer);
2443
+ rb_define_alloc_func(cRegExpAnalyzer, frt_data_alloc);
2444
+ rb_define_method(cRegExpAnalyzer, "initialize",
2445
+ frt_re_analyzer_init, -1);
2446
+ rb_define_method(cRegExpAnalyzer, "token_stream",
2447
+ frt_re_analyzer_token_stream, 2);
2448
+ }
2449
+
2450
+ /* rdoc hack
2451
+ extern VALUE mFerret = rb_define_module("Ferret");
2452
+ */
2453
+
2454
+ /*
2455
+ * Document-module: Ferret::Analysis
2456
+ *
2457
+ * == Summary
2458
+ *
2459
+ * The Analysis module contains all the classes used to analyze and tokenize
2460
+ * the data to be indexed. There are three main classes you need to know
2461
+ * about when dealing with analysis; Analyzer, TokenStream and Token.
2462
+ *
2463
+ * == Classes
2464
+ *
2465
+ * === Analyzer
2466
+ *
2467
+ * Analyzers handle all of your tokenizing needs. You pass an Analyzer to the
2468
+ * indexing class when you create it and it will create the TokenStreams
2469
+ * necessary to tokenize the fields in the documents. Most of the time you
2470
+ * won't need to worry about TokenStreams and Tokens; one of the Analyzers
2471
+ * distributed with Ferret will do exactly what you need. Otherwise you'll
2472
+ * need to implement a custom analyzer.
2473
+ *
2474
+ * === TokenStream
2475
+ *
2476
+ * A TokenStream is an enumeration of Tokens. There are two standard types of
2477
+ * TokenStream; Tokenizer and TokenFilter. A Tokenizer takes a String and
2478
+ * turns it into a list of Tokens. A TokenFilter takes another TokenStream
2479
+ * and post-processes the Tokens. You can chain as many TokenFilters together
2480
+ * as you like but they always need to finish with a Tokenizer.
2481
+ *
2482
+ * === Token
2483
+ *
2484
+ * A Token is a single term from a document field. A token contains the text
2485
+ * representing the term as well as the start and end offset of the token.
2486
+ * The start and end offset will represent the token as it appears in the
2487
+ * source field. Some TokenFilters may change the text in the Token but the
2488
+ * start and end offsets should stay the same so (end - start) won't
2489
+ * necessarily be equal to the length of text in the token. For example using
2490
+ * a stemming TokenFilter the term "Beginning" might have start and end
2491
+ * offsets of 10 and 19 respectively ("Beginning".length == 9) but Token#text
2492
+ * might be "begin" (after stemming).
2493
+ */
2494
+ void
2495
+ Init_Analysis(void)
2496
+ {
2497
+ mAnalysis = rb_define_module_under(mFerret, "Analysis");
2498
+
2499
+ /* TokenStream Methods */
2500
+ id_next = rb_intern("next");
2501
+ id_reset = rb_intern("text=");
2502
+ id_clone = rb_intern("clone");
2503
+ id_text = rb_intern("@text");
2504
+
2505
+ /* Analyzer Methods */
2506
+ id_token_stream = rb_intern("token_stream");
2507
+
2508
+ object_space = rb_hash_new();
2509
+ rb_define_const(mFerret, "OBJECT_SPACE", object_space);
2510
+
2511
+ /*** * * Locale stuff * * ***/
2512
+ rb_define_singleton_method(mFerret, "locale=", frt_set_locale, 1);
2513
+ rb_define_singleton_method(mFerret, "locale", frt_get_locale, 0);
2514
+
2515
+ rb_define_const(mAnalysis, "ENGLISH_STOP_WORDS",
2516
+ get_rstopwords(ENGLISH_STOP_WORDS));
2517
+ rb_define_const(mAnalysis, "FULL_ENGLISH_STOP_WORDS",
2518
+ get_rstopwords(FULL_ENGLISH_STOP_WORDS));
2519
+ rb_define_const(mAnalysis, "EXTENDED_ENGLISH_STOP_WORDS",
2520
+ get_rstopwords(EXTENDED_ENGLISH_STOP_WORDS));
2521
+ rb_define_const(mAnalysis, "FULL_FRENCH_STOP_WORDS",
2522
+ get_rstopwords(FULL_FRENCH_STOP_WORDS));
2523
+ rb_define_const(mAnalysis, "FULL_SPANISH_STOP_WORDS",
2524
+ get_rstopwords(FULL_SPANISH_STOP_WORDS));
2525
+ rb_define_const(mAnalysis, "FULL_PORTUGUESE_STOP_WORDS",
2526
+ get_rstopwords(FULL_PORTUGUESE_STOP_WORDS));
2527
+ rb_define_const(mAnalysis, "FULL_ITALIAN_STOP_WORDS",
2528
+ get_rstopwords(FULL_ITALIAN_STOP_WORDS));
2529
+ rb_define_const(mAnalysis, "FULL_GERMAN_STOP_WORDS",
2530
+ get_rstopwords(FULL_GERMAN_STOP_WORDS));
2531
+ rb_define_const(mAnalysis, "FULL_DUTCH_STOP_WORDS",
2532
+ get_rstopwords(FULL_DUTCH_STOP_WORDS));
2533
+ rb_define_const(mAnalysis, "FULL_SWEDISH_STOP_WORDS",
2534
+ get_rstopwords(FULL_SWEDISH_STOP_WORDS));
2535
+ rb_define_const(mAnalysis, "FULL_NORWEGIAN_STOP_WORDS",
2536
+ get_rstopwords(FULL_NORWEGIAN_STOP_WORDS));
2537
+ rb_define_const(mAnalysis, "FULL_DANISH_STOP_WORDS",
2538
+ get_rstopwords(FULL_DANISH_STOP_WORDS));
2539
+ rb_define_const(mAnalysis, "FULL_RUSSIAN_STOP_WORDS",
2540
+ get_rstopwords(FULL_RUSSIAN_STOP_WORDS));
2541
+ rb_define_const(mAnalysis, "FULL_FINNISH_STOP_WORDS",
2542
+ get_rstopwords(FULL_FINNISH_STOP_WORDS));
2543
+
2544
+ Init_Token();
2545
+ Init_TokenStream();
2546
+
2547
+ Init_AsciiLetterTokenizer();
2548
+ Init_LetterTokenizer();
2549
+
2550
+ Init_AsciiWhiteSpaceTokenizer();
2551
+ Init_WhiteSpaceTokenizer();
2552
+
2553
+ Init_AsciiStandardTokenizer();
2554
+ Init_StandardTokenizer();
2555
+
2556
+ Init_RegExpTokenizer();
2557
+
2558
+ Init_AsciiLowerCaseFilter();
2559
+ Init_LowerCaseFilter();
2560
+ Init_HyphenFilter();
2561
+ Init_StopFilter();
2562
+ Init_MappingFilter();
2563
+ Init_StemFilter();
2564
+
2565
+ Init_Analyzer();
2566
+ Init_AsciiLetterAnalyzer();
2567
+ Init_LetterAnalyzer();
2568
+ Init_AsciiWhiteSpaceAnalyzer();
2569
+ Init_WhiteSpaceAnalyzer();
2570
+ Init_AsciiStandardAnalyzer();
2571
+ Init_StandardAnalyzer();
2572
+ Init_PerFieldAnalyzer();
2573
+ Init_RegExpAnalyzer();
2574
+
2575
+ }