sa-ferret 0.11.6.3

Files changed (193)
  1. data/CHANGELOG +24 -0
  2. data/MIT-LICENSE +20 -0
  3. data/README +102 -0
  4. data/Rakefile +338 -0
  5. data/TODO +17 -0
  6. data/TUTORIAL +231 -0
  7. data/bin/ferret-browser +79 -0
  8. data/ext/analysis.c +1588 -0
  9. data/ext/analysis.h +219 -0
  10. data/ext/api.c +69 -0
  11. data/ext/api.h +27 -0
  12. data/ext/array.c +123 -0
  13. data/ext/array.h +53 -0
  14. data/ext/bitvector.c +540 -0
  15. data/ext/bitvector.h +272 -0
  16. data/ext/compound_io.c +383 -0
  17. data/ext/config.h +42 -0
  18. data/ext/document.c +156 -0
  19. data/ext/document.h +53 -0
  20. data/ext/except.c +120 -0
  21. data/ext/except.h +168 -0
  22. data/ext/extconf.rb +14 -0
  23. data/ext/ferret.c +402 -0
  24. data/ext/ferret.h +91 -0
  25. data/ext/filter.c +156 -0
  26. data/ext/fs_store.c +483 -0
  27. data/ext/global.c +418 -0
  28. data/ext/global.h +117 -0
  29. data/ext/hash.c +567 -0
  30. data/ext/hash.h +473 -0
  31. data/ext/hashset.c +170 -0
  32. data/ext/hashset.h +187 -0
  33. data/ext/header.h +58 -0
  34. data/ext/helper.c +62 -0
  35. data/ext/helper.h +13 -0
  36. data/ext/index.c +6425 -0
  37. data/ext/index.h +961 -0
  38. data/ext/lang.h +48 -0
  39. data/ext/libstemmer.c +92 -0
  40. data/ext/libstemmer.h +79 -0
  41. data/ext/mempool.c +87 -0
  42. data/ext/mempool.h +35 -0
  43. data/ext/modules.h +162 -0
  44. data/ext/multimapper.c +310 -0
  45. data/ext/multimapper.h +51 -0
  46. data/ext/posh.c +1006 -0
  47. data/ext/posh.h +1007 -0
  48. data/ext/priorityqueue.c +151 -0
  49. data/ext/priorityqueue.h +143 -0
  50. data/ext/q_boolean.c +1608 -0
  51. data/ext/q_const_score.c +161 -0
  52. data/ext/q_filtered_query.c +209 -0
  53. data/ext/q_fuzzy.c +268 -0
  54. data/ext/q_match_all.c +148 -0
  55. data/ext/q_multi_term.c +677 -0
  56. data/ext/q_parser.c +2825 -0
  57. data/ext/q_phrase.c +1126 -0
  58. data/ext/q_prefix.c +100 -0
  59. data/ext/q_range.c +350 -0
  60. data/ext/q_span.c +2402 -0
  61. data/ext/q_term.c +337 -0
  62. data/ext/q_wildcard.c +171 -0
  63. data/ext/r_analysis.c +2499 -0
  64. data/ext/r_index.c +3485 -0
  65. data/ext/r_qparser.c +585 -0
  66. data/ext/r_search.c +4107 -0
  67. data/ext/r_store.c +513 -0
  68. data/ext/r_utils.c +963 -0
  69. data/ext/ram_store.c +471 -0
  70. data/ext/search.c +1741 -0
  71. data/ext/search.h +885 -0
  72. data/ext/similarity.c +150 -0
  73. data/ext/similarity.h +82 -0
  74. data/ext/sort.c +983 -0
  75. data/ext/stem_ISO_8859_1_danish.c +338 -0
  76. data/ext/stem_ISO_8859_1_danish.h +16 -0
  77. data/ext/stem_ISO_8859_1_dutch.c +635 -0
  78. data/ext/stem_ISO_8859_1_dutch.h +16 -0
  79. data/ext/stem_ISO_8859_1_english.c +1156 -0
  80. data/ext/stem_ISO_8859_1_english.h +16 -0
  81. data/ext/stem_ISO_8859_1_finnish.c +792 -0
  82. data/ext/stem_ISO_8859_1_finnish.h +16 -0
  83. data/ext/stem_ISO_8859_1_french.c +1276 -0
  84. data/ext/stem_ISO_8859_1_french.h +16 -0
  85. data/ext/stem_ISO_8859_1_german.c +512 -0
  86. data/ext/stem_ISO_8859_1_german.h +16 -0
  87. data/ext/stem_ISO_8859_1_italian.c +1091 -0
  88. data/ext/stem_ISO_8859_1_italian.h +16 -0
  89. data/ext/stem_ISO_8859_1_norwegian.c +296 -0
  90. data/ext/stem_ISO_8859_1_norwegian.h +16 -0
  91. data/ext/stem_ISO_8859_1_porter.c +776 -0
  92. data/ext/stem_ISO_8859_1_porter.h +16 -0
  93. data/ext/stem_ISO_8859_1_portuguese.c +1035 -0
  94. data/ext/stem_ISO_8859_1_portuguese.h +16 -0
  95. data/ext/stem_ISO_8859_1_spanish.c +1119 -0
  96. data/ext/stem_ISO_8859_1_spanish.h +16 -0
  97. data/ext/stem_ISO_8859_1_swedish.c +307 -0
  98. data/ext/stem_ISO_8859_1_swedish.h +16 -0
  99. data/ext/stem_KOI8_R_russian.c +701 -0
  100. data/ext/stem_KOI8_R_russian.h +16 -0
  101. data/ext/stem_UTF_8_danish.c +344 -0
  102. data/ext/stem_UTF_8_danish.h +16 -0
  103. data/ext/stem_UTF_8_dutch.c +653 -0
  104. data/ext/stem_UTF_8_dutch.h +16 -0
  105. data/ext/stem_UTF_8_english.c +1176 -0
  106. data/ext/stem_UTF_8_english.h +16 -0
  107. data/ext/stem_UTF_8_finnish.c +808 -0
  108. data/ext/stem_UTF_8_finnish.h +16 -0
  109. data/ext/stem_UTF_8_french.c +1296 -0
  110. data/ext/stem_UTF_8_french.h +16 -0
  111. data/ext/stem_UTF_8_german.c +526 -0
  112. data/ext/stem_UTF_8_german.h +16 -0
  113. data/ext/stem_UTF_8_italian.c +1113 -0
  114. data/ext/stem_UTF_8_italian.h +16 -0
  115. data/ext/stem_UTF_8_norwegian.c +302 -0
  116. data/ext/stem_UTF_8_norwegian.h +16 -0
  117. data/ext/stem_UTF_8_porter.c +794 -0
  118. data/ext/stem_UTF_8_porter.h +16 -0
  119. data/ext/stem_UTF_8_portuguese.c +1055 -0
  120. data/ext/stem_UTF_8_portuguese.h +16 -0
  121. data/ext/stem_UTF_8_russian.c +709 -0
  122. data/ext/stem_UTF_8_russian.h +16 -0
  123. data/ext/stem_UTF_8_spanish.c +1137 -0
  124. data/ext/stem_UTF_8_spanish.h +16 -0
  125. data/ext/stem_UTF_8_swedish.c +313 -0
  126. data/ext/stem_UTF_8_swedish.h +16 -0
  127. data/ext/stopwords.c +401 -0
  128. data/ext/store.c +692 -0
  129. data/ext/store.h +777 -0
  130. data/ext/term_vectors.c +352 -0
  131. data/ext/threading.h +31 -0
  132. data/ext/utilities.c +446 -0
  133. data/ext/win32.h +54 -0
  134. data/lib/ferret.rb +29 -0
  135. data/lib/ferret/browser.rb +246 -0
  136. data/lib/ferret/browser/s/global.js +192 -0
  137. data/lib/ferret/browser/s/style.css +148 -0
  138. data/lib/ferret/browser/views/document/list.rhtml +49 -0
  139. data/lib/ferret/browser/views/document/show.rhtml +27 -0
  140. data/lib/ferret/browser/views/error/index.rhtml +7 -0
  141. data/lib/ferret/browser/views/help/index.rhtml +8 -0
  142. data/lib/ferret/browser/views/home/index.rhtml +29 -0
  143. data/lib/ferret/browser/views/layout.rhtml +22 -0
  144. data/lib/ferret/browser/views/term-vector/index.rhtml +4 -0
  145. data/lib/ferret/browser/views/term/index.rhtml +199 -0
  146. data/lib/ferret/browser/views/term/termdocs.rhtml +1 -0
  147. data/lib/ferret/browser/webrick.rb +14 -0
  148. data/lib/ferret/document.rb +130 -0
  149. data/lib/ferret/field_infos.rb +44 -0
  150. data/lib/ferret/index.rb +786 -0
  151. data/lib/ferret/number_tools.rb +157 -0
  152. data/lib/ferret_version.rb +3 -0
  153. data/setup.rb +1555 -0
  154. data/test/test_all.rb +5 -0
  155. data/test/test_helper.rb +24 -0
  156. data/test/threading/number_to_spoken.rb +132 -0
  157. data/test/threading/thread_safety_index_test.rb +79 -0
  158. data/test/threading/thread_safety_read_write_test.rb +76 -0
  159. data/test/threading/thread_safety_test.rb +133 -0
  160. data/test/unit/analysis/tc_analyzer.rb +548 -0
  161. data/test/unit/analysis/tc_token_stream.rb +646 -0
  162. data/test/unit/index/tc_index.rb +762 -0
  163. data/test/unit/index/tc_index_reader.rb +699 -0
  164. data/test/unit/index/tc_index_writer.rb +437 -0
  165. data/test/unit/index/th_doc.rb +315 -0
  166. data/test/unit/largefile/tc_largefile.rb +46 -0
  167. data/test/unit/query_parser/tc_query_parser.rb +238 -0
  168. data/test/unit/search/tc_filter.rb +135 -0
  169. data/test/unit/search/tc_fuzzy_query.rb +147 -0
  170. data/test/unit/search/tc_index_searcher.rb +61 -0
  171. data/test/unit/search/tc_multi_searcher.rb +128 -0
  172. data/test/unit/search/tc_multiple_search_requests.rb +58 -0
  173. data/test/unit/search/tc_search_and_sort.rb +179 -0
  174. data/test/unit/search/tc_sort.rb +49 -0
  175. data/test/unit/search/tc_sort_field.rb +27 -0
  176. data/test/unit/search/tc_spans.rb +190 -0
  177. data/test/unit/search/tm_searcher.rb +384 -0
  178. data/test/unit/store/tc_fs_store.rb +77 -0
  179. data/test/unit/store/tc_ram_store.rb +35 -0
  180. data/test/unit/store/tm_store.rb +34 -0
  181. data/test/unit/store/tm_store_lock.rb +68 -0
  182. data/test/unit/tc_document.rb +81 -0
  183. data/test/unit/ts_analysis.rb +2 -0
  184. data/test/unit/ts_index.rb +2 -0
  185. data/test/unit/ts_largefile.rb +4 -0
  186. data/test/unit/ts_query_parser.rb +2 -0
  187. data/test/unit/ts_search.rb +2 -0
  188. data/test/unit/ts_store.rb +2 -0
  189. data/test/unit/ts_utils.rb +2 -0
  190. data/test/unit/utils/tc_bit_vector.rb +295 -0
  191. data/test/unit/utils/tc_number_tools.rb +117 -0
  192. data/test/unit/utils/tc_priority_queue.rb +106 -0
  193. metadata +269 -0
data/ext/r_analysis.c ADDED
@@ -0,0 +1,2499 @@
1
+ #include <regex.h>
2
+ #include <locale.h>
3
+ #include <st.h>
4
+ #include "ferret.h"
5
+ #include "analysis.h"
6
+
7
+ static char *frt_locale = NULL;
8
+
9
+ static VALUE mAnalysis;
10
+
11
+ static VALUE cToken;
12
+ static VALUE cAsciiLetterTokenizer;
13
+ static VALUE cLetterTokenizer;
14
+ static VALUE cAsciiWhiteSpaceTokenizer;
15
+ static VALUE cWhiteSpaceTokenizer;
16
+ static VALUE cAsciiStandardTokenizer;
17
+ static VALUE cStandardTokenizer;
18
+ static VALUE cRegExpTokenizer;
19
+
20
+ static VALUE cAsciiLowerCaseFilter;
21
+ static VALUE cLowerCaseFilter;
22
+ static VALUE cStopFilter;
23
+ static VALUE cMappingFilter;
24
+ static VALUE cHyphenFilter;
25
+ static VALUE cStemFilter;
26
+
27
+ static VALUE cAnalyzer;
28
+ static VALUE cAsciiLetterAnalyzer;
29
+ static VALUE cLetterAnalyzer;
30
+ static VALUE cAsciiWhiteSpaceAnalyzer;
31
+ static VALUE cWhiteSpaceAnalyzer;
32
+ static VALUE cAsciiStandardAnalyzer;
33
+ static VALUE cStandardAnalyzer;
34
+ static VALUE cPerFieldAnalyzer;
35
+ static VALUE cRegExpAnalyzer;
36
+
37
+ static VALUE cTokenStream;
38
+
39
+ /* TokenStream Methods */
40
+ static ID id_next;
41
+ static ID id_reset;
42
+ static ID id_clone;
43
+ static ID id_text;
44
+
45
+ /* Analyzer Methods */
46
+ static ID id_token_stream;
47
+
48
+ static VALUE object_space;
49
+
50
+ extern int ruby_re_search(struct re_pattern_buffer *, const char *, int, int,
51
+ int, struct re_registers *);
52
+
53
+ int
54
+ frt_rb_hash_size(VALUE hash)
55
+ {
56
+ return RHASH(hash)->tbl->num_entries;
57
+ }
58
+
59
+ /****************************************************************************
60
+ *
61
+ * Utility Methods
62
+ *
63
+ ****************************************************************************/
64
+
65
+ static char **
66
+ get_stopwords(VALUE rstop_words)
67
+ {
68
+ char **stop_words;
69
+ int i, len;
70
+ VALUE rstr;
71
+ Check_Type(rstop_words, T_ARRAY);
72
+ len = RARRAY(rstop_words)->len;
73
+ stop_words = ALLOC_N(char *, RARRAY(rstop_words)->len + 1);
74
+ stop_words[len] = NULL;
75
+ for (i = 0; i < len; i++) {
76
+ rstr = rb_obj_as_string(RARRAY(rstop_words)->ptr[i]);
77
+ stop_words[i] = rs2s(rstr);
78
+ }
79
+ return stop_words;
80
+ }
81
+
82
+ /****************************************************************************
83
+ *
84
+ * token methods
85
+ *
86
+ ****************************************************************************/
87
+
88
+ typedef struct RToken {
89
+ VALUE text;
90
+ int start;
91
+ int end;
92
+ int pos_inc;
93
+ } RToken;
94
+
95
+ static void
96
+ frt_token_free(void *p)
97
+ {
98
+ free(p);
99
+ }
100
+
101
+ static void
102
+ frt_token_mark(void *p)
103
+ {
104
+ RToken *token = (RToken *)p;
105
+ rb_gc_mark(token->text);
106
+ }
107
+
108
+ static VALUE
109
+ frt_token_alloc(VALUE klass)
110
+ {
111
+ return Data_Wrap_Struct(klass, &frt_token_mark, &frt_token_free,
112
+ ALLOC(RToken));
113
+ }
114
+
115
+ static VALUE
116
+ get_token(Token *tk)
117
+ {
118
+ RToken *token = ALLOC(RToken);
119
+
120
+ token->text = rb_str_new2(tk->text);
121
+ token->start = tk->start;
122
+ token->end = tk->end;
123
+ token->pos_inc = tk->pos_inc;
124
+ return Data_Wrap_Struct(cToken, &frt_token_mark, &frt_token_free, token);
125
+ }
126
+
127
+ Token *
128
+ frt_set_token(Token *tk, VALUE rt)
129
+ {
130
+ RToken *rtk;
131
+
132
+ if (rt == Qnil) return NULL;
133
+
134
+ Data_Get_Struct(rt, RToken, rtk);
135
+ tk_set(tk, rs2s(rtk->text), RSTRING(rtk->text)->len,
136
+ rtk->start, rtk->end, rtk->pos_inc);
137
+ return tk;
138
+ }
139
+
140
+ #define GET_TK(tk, self) Data_Get_Struct(self, RToken, tk)
141
+
142
+ /*
143
+ * call-seq:
144
+ * Token.new(text, start, end, pos_inc = 1) -> new Token
145
+ *
146
+ * Creates a new token setting the text, start and end offsets of the token
147
+ * and the position increment for the token.
148
+ *
149
+ * The position increment is usually set to 1 but you can set it to other
150
+ * values as needed. For example, if you have a stop word filter you will be
151
+ * skipping tokens. Let's say you have the stop words "the" and "and" and you
152
+ * parse the title "The Old Man and the Sea". The terms "Old", "Man" and
153
+ * "Sea" will have the position increments 2, 1 and 3 respectively.
154
+ *
155
+ * Another reason you might want to vary the position increment is if you are
156
+ * adding synonyms to the index. For example let's say you have the synonym
157
+ * group "quick", "fast" and "speedy". When tokenizing the phrase "Next day
158
+ * speedy delivery", you'll add "speedy" first with a position increment of 1
159
+ * and then "fast" and "quick" with position increments of 0 since they are
160
+ * represented in the same position.
161
+ *
162
+ * The offset values +start+ and +end+ should be byte offsets, not
163
+ * character offsets. This makes it easy to use those offsets to quickly
164
+ * access the token in the input string and also to insert highlighting tags
165
+ * when necessary.
166
+ *
167
+ * text:: the main text for the token.
168
+ * start:: the start offset of the token in bytes.
169
+ * end:: the end offset of the token in bytes.
170
+ * pos_inc:: the position increment of a token. See above.
171
+ * return:: a newly created and assigned Token object
172
+ */
173
+ static VALUE
174
+ frt_token_init(int argc, VALUE *argv, VALUE self)
175
+ {
176
+ RToken *token;
177
+ VALUE rtext, rstart, rend, rpos_inc, rtype;
178
+ GET_TK(token, self);
179
+ token->pos_inc = 1;
180
+ switch (rb_scan_args(argc, argv, "32", &rtext, &rstart,
181
+ &rend, &rpos_inc, &rtype)) {
182
+ case 5: /* type gets ignored at this stage */
183
+ case 4: token->pos_inc = FIX2INT(rpos_inc);
184
+ }
185
+ token->text = rb_obj_as_string(rtext);
186
+ token->start = FIX2INT(rstart);
187
+ token->end = FIX2INT(rend);
188
+ return self;
189
+ }
190
+
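+ /*
+  * A short usage sketch of the Token.new call-seq above (offsets are byte
+  * offsets and the values are illustrative only; the position increments
+  * follow the "The Old Man and the Sea" and synonym examples):
+  *
+  *   man  = Token.new("Man", 8, 11, 1)    # one position after the previous token
+  *   sea  = Token.new("Sea", 20, 23, 3)   # "and" and "the" were skipped by a stop filter
+  *   fast = Token.new("fast", 9, 15, 0)   # a synonym sharing its predecessor's position
+  */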
191
+ /*
192
+ * call-seq:
193
+ * token.cmp(other_token) -> bool
194
+ *
195
+ * Used to compare two tokens. Token is extended by Comparable so you can
196
+ * also use +<+, +>+, +<=+, +>=+ etc. to compare tokens.
197
+ *
198
+ * Tokens are sorted by the position in the text at which they occur, i.e.
199
+ * the start offset. If two tokens have the same start offset (see
200
+ * pos_inc=), they are sorted by the end offset and then
201
+ * lexically by the token text.
202
+ */
203
+ static VALUE
204
+ frt_token_cmp(VALUE self, VALUE rother)
205
+ {
206
+ RToken *token, *other;
207
+ int cmp;
208
+ GET_TK(token, self);
209
+ GET_TK(other, rother);
210
+ if (token->start > other->start) {
211
+ cmp = 1;
212
+ } else if (token->start < other->start) {
213
+ cmp = -1;
214
+ } else {
215
+ if (token->end > other->end) {
216
+ cmp = 1;
217
+ } else if (token->end < other->end) {
218
+ cmp = -1;
219
+ } else {
220
+ cmp = strcmp(rs2s(token->text), rs2s(other->text));
221
+ }
222
+ }
223
+ return INT2FIX(cmp);
224
+ }
225
+
226
+ /*
227
+ * call-seq:
228
+ * token.text -> text
229
+ *
230
+ * Returns the text that this token represents
231
+ */
232
+ static VALUE
233
+ frt_token_get_text(VALUE self)
234
+ {
235
+ RToken *token;
236
+ GET_TK(token, self);
237
+ return token->text;
238
+ }
239
+
240
+ /*
241
+ * call-seq:
242
+ * token.text = text -> text
243
+ *
244
+ * Set the text for this token.
245
+ */
246
+ static VALUE
247
+ frt_token_set_text(VALUE self, VALUE rtext)
248
+ {
249
+ RToken *token;
250
+ GET_TK(token, self);
251
+ token->text = rtext;
252
+ return rtext;
253
+ }
254
+
255
+ /*
256
+ * call-seq:
257
+ * token.start -> integer
258
+ *
259
+ * Start byte-position of this token
260
+ */
261
+ static VALUE
262
+ frt_token_get_start_offset(VALUE self)
263
+ {
264
+ RToken *token;
265
+ GET_TK(token, self);
266
+ return INT2FIX(token->start);
267
+ }
268
+
269
+ /*
270
+ * call-seq:
271
+ * token.end -> integer
272
+ *
273
+ * End byte-position of this token
274
+ */
275
+ static VALUE
276
+ frt_token_get_end_offset(VALUE self)
277
+ {
278
+ RToken *token;
279
+ GET_TK(token, self);
280
+ return INT2FIX(token->end);
281
+ }
282
+
283
+ /*
284
+ * call-seq:
285
+ * token.pos_inc -> integer
286
+ *
287
+ * Position Increment for this token
288
+ */
289
+ static VALUE
290
+ frt_token_get_pos_inc(VALUE self)
291
+ {
292
+ RToken *token;
293
+ GET_TK(token, self);
294
+ return INT2FIX(token->pos_inc);
295
+ }
296
+
297
+ /*
298
+ * call-seq:
299
+ * token.start = start -> integer
300
+ *
301
+ * Set start byte-position of this token
302
+ */
303
+ static VALUE
304
+ frt_token_set_start_offset(VALUE self, VALUE rstart)
305
+ {
306
+ RToken *token;
307
+ GET_TK(token, self);
308
+ token->start = FIX2INT(rstart);
309
+ return rstart;
310
+ }
311
+
312
+ /*
313
+ * call-seq:
314
+ * token.end = end -> integer
315
+ *
316
+ * Set end byte-position of this token
317
+ */
318
+ static VALUE
319
+ frt_token_set_end_offset(VALUE self, VALUE rend)
320
+ {
321
+ RToken *token;
322
+ GET_TK(token, self);
323
+ token->end = FIX2INT(rend);
324
+ return rend;
325
+ }
326
+
327
+ /*
328
+ * call-seq:
329
+ * token.pos_inc = pos_inc -> integer
330
+ *
331
+ * Set the position increment. This determines the position of this token
332
+ * relative to the previous Token in a TokenStream, used in phrase
333
+ * searching.
334
+ *
335
+ * The default value is 1.
336
+ *
337
+ * Some common uses for this are:
338
+ *
339
+ * * Set it to zero to put multiple terms in the same position. This is
340
+ * useful if, e.g., a word has multiple stems. Searches for phrases
341
+ * including either stem will match. In this case, all but the first
342
+ * stem's increment should be set to zero: the increment of the first
343
+ * instance should be one. Repeating a token with an increment of zero
344
+ * can also be used to boost the scores of matches on that token.
345
+ *
346
+ * * Set it to values greater than one to inhibit exact phrase matches.
347
+ * If, for example, one does not want phrases to match across removed
348
+ * stop words, then one could build a stop word filter that removes stop
349
+ * words and also sets the increment to the number of stop words removed
350
+ * before each non-stop word. Then exact phrase queries will only match
351
+ * when the terms occur with no intervening stop words.
352
+ *
353
+ */
354
+ static VALUE
355
+ frt_token_set_pos_inc(VALUE self, VALUE rpos_inc)
356
+ {
357
+ RToken *token;
358
+ GET_TK(token, self);
359
+ token->pos_inc = FIX2INT(rpos_inc);
360
+ return rpos_inc;
361
+ }
362
+
363
+ /*
364
+ * call-seq:
365
+ * token.to_s -> token_str
366
+ *
367
+ * Return a string representation of the token
368
+ */
369
+ static VALUE
370
+ frt_token_to_s(VALUE self)
371
+ {
372
+ RToken *token;
373
+ char *buf;
374
+ GET_TK(token, self);
375
+ buf = alloca(RSTRING(token->text)->len + 80);
376
+ sprintf(buf, "token[\"%s\":%d:%d:%d]", rs2s(token->text),
377
+ token->start, token->end, token->pos_inc);
378
+ return rb_str_new2(buf);
379
+ }
380
+
381
+ /****************************************************************************
382
+ *
383
+ * TokenStream Methods
384
+ *
385
+ ****************************************************************************/
386
+
387
+ #define GET_TS(ts, self) Data_Get_Struct(self, TokenStream, ts)
388
+
389
+ static void
390
+ frt_ts_mark(void *p)
391
+ {
392
+ TokenStream *ts = (TokenStream *)p;
393
+ if (ts->text) frt_gc_mark(&ts->text);
394
+ }
395
+
396
+ static void
397
+ frt_ts_free(TokenStream *ts)
398
+ {
399
+ if (object_get(&ts->text) != Qnil) {
400
+ object_del(&ts->text);
401
+ }
402
+ object_del(ts);
403
+ ts_deref(ts);
404
+ }
405
+
406
+ static void frt_rets_free(TokenStream *ts);
407
+ static void frt_rets_mark(TokenStream *ts);
408
+ static Token *rets_next(TokenStream *ts);
409
+
410
+ static VALUE
411
+ get_rb_token_stream(TokenStream *ts)
412
+ {
413
+ VALUE rts = object_get(ts);
414
+ if (rts == Qnil) {
415
+ if (ts->next == &rets_next) {
416
+ rts = Data_Wrap_Struct(cTokenStream, &frt_rets_mark,
417
+ &frt_rets_free, ts);
418
+ } else {
419
+ rts = Data_Wrap_Struct(cTokenStream, &frt_ts_mark,
420
+ &frt_ts_free, ts);
421
+ }
422
+ object_add(ts, rts);
423
+ }
424
+ return rts;
425
+ }
426
+
427
+ static INLINE VALUE
428
+ get_wrapped_ts(VALUE self, VALUE rstr, TokenStream *ts)
429
+ {
430
+ StringValue(rstr);
431
+ ts->reset(ts, rs2s(rstr));
432
+ Frt_Wrap_Struct(self, &frt_ts_mark, &frt_ts_free, ts);
433
+ object_add(&ts->text, rstr);
434
+ object_add(ts, self);
435
+ return self;
436
+ }
437
+
438
+ /*
439
+ * call-seq:
440
+ * token_stream.text = text -> text
441
+ *
442
+ * Set the text attribute of the TokenStream to the text you wish to be
443
+ * tokenized. For example, you may do this:
444
+ *
445
+ * token_stream.text = File.read(file_name)
446
+ */
447
+ static VALUE
448
+ frt_ts_set_text(VALUE self, VALUE rtext)
449
+ {
450
+ TokenStream *ts;
451
+ Data_Get_Struct(self, TokenStream, ts);
452
+ StringValue(rtext);
453
+ ts->reset(ts, rs2s(rtext));
454
+
455
+ /* prevent garbage collection */
456
+ rb_ivar_set(self, id_text, rtext);
457
+
458
+ return rtext;
459
+ }
460
+
461
+ /*
462
+ * call-seq:
463
+ * token_stream.text -> text
464
+ *
465
+ * Return the text that the TokenStream is tokenizing
466
+ */
467
+ static VALUE
468
+ frt_ts_get_text(VALUE self)
469
+ {
470
+ VALUE rtext = Qnil;
471
+ TokenStream *ts;
472
+ Data_Get_Struct(self, TokenStream, ts);
473
+ if ((rtext = object_get(&ts->text)) == Qnil) {
474
+ if (ts->text) {
475
+ rtext = rb_str_new2(ts->text);
476
+ object_set(&ts->text, rtext);
477
+ }
478
+ }
479
+ return rtext;
480
+ }
481
+
482
+ /*
483
+ * call-seq:
484
+ * token_stream.next -> token
485
+ *
486
+ * Return the next token from the TokenStream or nil if there are no more
487
+ * tokens.
488
+ */
489
+ static VALUE
490
+ frt_ts_next(VALUE self)
491
+ {
492
+ TokenStream *ts;
493
+ Token *next;
494
+ GET_TS(ts, self);
495
+ next = ts->next(ts);
496
+ if (next == NULL) {
497
+ return Qnil;
498
+ }
499
+
500
+ return get_token(next);
501
+ }
502
+
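+ /*
+  * A small sketch of draining a stream with +next+; any concrete tokenizer
+  * documented below will do, and the loop ends when +next+ returns nil:
+  *
+  *   while token = token_stream.next
+  *     puts "#{token.text} [#{token.start}..#{token.end}]"
+  *   end
+  */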
503
+ /****************************************************************************
504
+ * TokenFilter
505
+ ****************************************************************************/
506
+
507
+ #define TkFilt(filter) ((TokenFilter *)(filter))
508
+
509
+ static void
510
+ frt_tf_mark(void *p)
511
+ {
512
+ TokenStream *ts = (TokenStream *)p;
513
+ if (TkFilt(ts)->sub_ts) {
514
+ frt_gc_mark(&TkFilt(ts)->sub_ts);
515
+ }
516
+ }
517
+
518
+ static void
519
+ frt_tf_free(TokenStream *ts)
520
+ {
521
+ if (TkFilt(ts)->sub_ts && (object_get(&TkFilt(ts)->sub_ts) != Qnil)) {
522
+ object_del(&TkFilt(ts)->sub_ts);
523
+ }
524
+ object_del(ts);
525
+ ts_deref(ts);
526
+ }
527
+
528
+
529
+ /****************************************************************************
530
+ * CWrappedTokenStream
531
+ ****************************************************************************/
532
+
533
+ #define CachedTS(token_stream) ((CachedTokenStream *)(token_stream))
534
+ #define CWTS(token_stream) ((CWrappedTokenStream *)(token_stream))
535
+
536
+ typedef struct CWrappedTokenStream {
537
+ CachedTokenStream super;
538
+ VALUE rts;
539
+ } CWrappedTokenStream;
540
+
541
+ static void
542
+ cwrts_destroy_i(TokenStream *ts)
543
+ {
544
+ if (object_get(&ts->text) != Qnil) {
545
+ object_del(&ts->text);
546
+ }
547
+ rb_hash_delete(object_space, ((VALUE)ts)|1);
548
+ /*printf("rb_hash_size = %d\n", frt_rb_hash_size(object_space)); */
549
+ free(ts);
550
+ }
551
+
552
+ static Token *
553
+ cwrts_next(TokenStream *ts)
554
+ {
555
+ VALUE rtoken = rb_funcall(CWTS(ts)->rts, id_next, 0);
556
+ return frt_set_token(&(CachedTS(ts)->token), rtoken);
557
+ }
558
+
559
+ static TokenStream *
560
+ cwrts_reset(TokenStream *ts, char *text)
561
+ {
562
+ ts->t = ts->text = text;
563
+ rb_funcall(CWTS(ts)->rts, id_reset, 1, rb_str_new2(text));
564
+ return ts;
565
+ }
566
+
567
+ static TokenStream *
568
+ cwrts_clone_i(TokenStream *orig_ts)
569
+ {
570
+ TokenStream *new_ts = ts_clone_size(orig_ts, sizeof(CWrappedTokenStream));
571
+ VALUE rts = CWTS(new_ts)->rts = rb_funcall(CWTS(orig_ts)->rts, id_clone, 0);
572
+ rb_hash_aset(object_space, ((VALUE)new_ts)|1, rts);
573
+ return new_ts;
574
+ }
575
+
576
+ static TokenStream *
577
+ frt_get_cwrapped_rts(VALUE rts)
578
+ {
579
+ TokenStream *ts;
580
+ if (frt_is_cclass(rts) && DATA_PTR(rts)) {
581
+ GET_TS(ts, rts);
582
+ REF(ts);
583
+ }
584
+ else {
585
+ ts = ts_new(CWrappedTokenStream);
586
+ CWTS(ts)->rts = rts;
587
+ ts->next = &cwrts_next;
588
+ ts->reset = &cwrts_reset;
589
+ ts->clone_i = &cwrts_clone_i;
590
+ ts->destroy_i = &cwrts_destroy_i;
591
+ /* prevent from being garbage collected */
592
+ rb_hash_aset(object_space, ((VALUE)ts)|1, rts);
593
+ ts->ref_cnt = 1;
594
+ }
595
+ return ts;
596
+ }
597
+
598
+ /****************************************************************************
599
+ * RegExpTokenStream
600
+ ****************************************************************************/
601
+
602
+ #define P "[_\\/.,-]"
603
+ #define HASDIGIT "\\w*\\d\\w*"
604
+ #define ALPHA "[-_[:alpha:]]"
605
+ #define ALNUM "[-_[:alnum:]]"
606
+
607
+ #define RETS(token_stream) ((RegExpTokenStream *)(token_stream))
608
+
609
+ static const char *TOKEN_RE =
610
+ ALPHA "+(('" ALPHA "+)+|\\.(" ALPHA "\\.)+|"
611
+ "(@|\\&)\\w+([-.]\\w+)*|:\\/\\/" ALNUM "+([-.\\/]" ALNUM "+)*)?"
612
+ "|\\w+(([-._]\\w+)*\\@\\w+([-.]\\w+)+"
613
+ "|" P HASDIGIT "(" P "\\w+" P HASDIGIT ")*(" P "\\w+)?"
614
+ "|(\\.\\w+)+"
615
+ "|"
616
+ ")";
617
+ static VALUE rtoken_re;
618
+
619
+ typedef struct RegExpTokenStream {
620
+ CachedTokenStream super;
621
+ VALUE rtext;
622
+ VALUE regex;
623
+ VALUE proc;
624
+ int curr_ind;
625
+ } RegExpTokenStream;
626
+
627
+ static void
628
+ rets_destroy_i(TokenStream *ts)
629
+ {
630
+ if (object_get(&ts->text) != Qnil) {
631
+ object_del(&ts->text);
632
+ }
633
+ rb_hash_delete(object_space, ((VALUE)ts)|1);
634
+ /*printf("rb_hash_size = %d\n", frt_rb_hash_size(object_space)); */
635
+ free(ts);
636
+ }
637
+
638
+ static void
639
+ frt_rets_free(TokenStream *ts)
640
+ {
641
+ if (object_get(&ts->text) != Qnil) {
642
+ object_del(&ts->text);
643
+ }
644
+ object_del(ts);
645
+ ts_deref(ts);
646
+ }
647
+
648
+ static void
649
+ frt_rets_mark(TokenStream *ts)
650
+ {
651
+ if (ts->text) frt_gc_mark(&ts->text);
652
+ rb_gc_mark(RETS(ts)->rtext);
653
+ rb_gc_mark(RETS(ts)->regex);
654
+ rb_gc_mark(RETS(ts)->proc);
655
+ }
656
+
657
+ /*
658
+ * call-seq:
659
+ * tokenizer.text = text -> text
660
+ *
661
+ * Set the text to be tokenized by the tokenizer. The tokenizer gets reset to
662
+ * tokenize the text from the beginning.
663
+ */
664
+ static VALUE
665
+ frt_rets_set_text(VALUE self, VALUE rtext)
666
+ {
667
+ TokenStream *ts;
668
+ GET_TS(ts, self);
669
+
670
+ rb_hash_aset(object_space, ((VALUE)ts)|1, rtext);
671
+ StringValue(rtext);
672
+ RETS(ts)->rtext = rtext;
673
+ RETS(ts)->curr_ind = 0;
674
+
675
+ return rtext;
676
+ }
677
+
678
+ /*
679
+ * call-seq:
680
+ * tokenizer.text -> text
681
+ *
682
+ * Get the text being tokenized by the tokenizer.
683
+ */
684
+ static VALUE
685
+ frt_rets_get_text(VALUE self)
686
+ {
687
+ TokenStream *ts;
688
+ GET_TS(ts, self);
689
+ return RETS(ts)->rtext;
690
+ }
691
+
692
+ static Token *
693
+ rets_next(TokenStream *ts)
694
+ {
695
+ static struct re_registers regs;
696
+ int ret, beg, end;
697
+ struct RString *rtext = RSTRING(RETS(ts)->rtext);
698
+ Check_Type(RETS(ts)->regex, T_REGEXP);
699
+ ret = ruby_re_search(RREGEXP(RETS(ts)->regex)->ptr,
700
+ rtext->ptr, rtext->len,
701
+ RETS(ts)->curr_ind, rtext->len - RETS(ts)->curr_ind,
702
+ &regs);
703
+
704
+ if (ret == -2) rb_raise(rb_eStandardError, "regexp buffer overflow");
705
+ if (ret < 0) return NULL; /* not matched */
706
+
707
+ beg = regs.beg[0];
708
+ RETS(ts)->curr_ind = end = regs.end[0];
709
+ if (NIL_P(RETS(ts)->proc)) {
710
+ return tk_set(&(CachedTS(ts)->token), rtext->ptr + beg, end - beg,
711
+ beg, end, 1);
712
+ } else {
713
+ VALUE rtok = rb_str_new(rtext->ptr + beg, end - beg);
714
+ rtok = rb_funcall(RETS(ts)->proc, id_call, 1, rtok);
715
+ return tk_set(&(CachedTS(ts)->token), rs2s(rtok),
716
+ RSTRING(rtok)->len, beg, end, 1);
717
+ }
718
+ }
719
+
720
+ static TokenStream *
721
+ rets_reset(TokenStream *ts, char *text)
722
+ {
723
+ RETS(ts)->rtext = rb_str_new2(text);
724
+ RETS(ts)->curr_ind = 0;
725
+ return ts;
726
+ }
727
+
728
+ static TokenStream *
729
+ rets_clone_i(TokenStream *orig_ts)
730
+ {
731
+ TokenStream *ts = ts_clone_size(orig_ts, sizeof(RegExpTokenStream));
732
+ return ts;
733
+ }
734
+
735
+ static TokenStream *
736
+ rets_new(VALUE rtext, VALUE regex, VALUE proc)
737
+ {
738
+ TokenStream *ts = ts_new(RegExpTokenStream);
739
+
740
+ if (rtext != Qnil) {
741
+ rtext = StringValue(rtext);
742
+ rb_hash_aset(object_space, ((VALUE)ts)|1, rtext);
743
+ }
744
+ ts->reset = &rets_reset;
745
+ ts->next = &rets_next;
746
+ ts->clone_i = &rets_clone_i;
747
+ ts->destroy_i = &rets_destroy_i;
748
+
749
+ RETS(ts)->curr_ind = 0;
750
+ RETS(ts)->rtext = rtext;
751
+ RETS(ts)->proc = proc;
752
+
753
+ if (NIL_P(regex)) {
754
+ RETS(ts)->regex = rtoken_re;
755
+ } else {
756
+ Check_Type(regex, T_REGEXP);
757
+ RETS(ts)->regex = regex;
758
+ }
759
+
760
+ return ts;
761
+ }
762
+
763
+ /*
764
+ * call-seq:
765
+ * RegExpTokenizer.new(input, /[[:alpha:]]+/)
766
+ *
767
+ * Create a new tokenizer based on a regular expression
768
+ *
769
+ * input:: text to tokenize
770
+ * regexp:: regular expression used to recognize tokens in the input
771
+ */
772
+ static VALUE
773
+ frt_rets_init(int argc, VALUE *argv, VALUE self)
774
+ {
775
+ VALUE rtext, regex, proc;
776
+ TokenStream *ts;
777
+
778
+ rb_scan_args(argc, argv, "11&", &rtext, &regex, &proc);
779
+
780
+ ts = rets_new(rtext, regex, proc);
781
+
782
+ Frt_Wrap_Struct(self, &frt_rets_mark, &frt_rets_free, ts);
783
+ object_add(ts, self);
784
+ return self;
785
+ }
786
+
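+ /*
+  * A hedged sketch of the call-seq above; the optional block is applied to
+  * each matched token's text (see rets_next), and the regexp falls back to
+  * the built-in TOKEN_RE when omitted:
+  *
+  *   t = RegExpTokenizer.new("One, two!", /[[:alpha:]]+/) { |word| word.upcase }
+  *   t.next.text #=> "ONE"
+  */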
787
+ /****************************************************************************
788
+ * Tokenizers
789
+ ****************************************************************************/
790
+
791
+ #define TS_ARGS(dflt) \
792
+ bool lower;\
793
+ VALUE rlower, rstr;\
794
+ rb_scan_args(argc, argv, "11", &rstr, &rlower);\
795
+ lower = (argc ? RTEST(rlower) : dflt)
796
+
797
+ /*
798
+ * call-seq:
799
+ * AsciiLetterTokenizer.new() -> tokenizer
800
+ *
801
+ * Create a new AsciiLetterTokenizer
802
+ */
803
+ static VALUE
804
+ frt_a_letter_tokenizer_init(VALUE self, VALUE rstr)
805
+ {
806
+ return get_wrapped_ts(self, rstr, letter_tokenizer_new());
807
+ }
808
+
809
+ /*
810
+ * call-seq:
811
+ * LetterTokenizer.new(lower = true) -> tokenizer
812
+ *
813
+ * Create a new LetterTokenizer which optionally downcases tokens. Downcasing
814
+ * is done according to the current locale.
815
+ *
816
+ * lower:: set to false if you don't wish to downcase tokens
817
+ */
818
+ static VALUE
819
+ frt_letter_tokenizer_init(int argc, VALUE *argv, VALUE self)
820
+ {
821
+ TS_ARGS(false);
822
+ #ifndef POSH_OS_WIN32
823
+ if (!frt_locale) frt_locale = setlocale(LC_CTYPE, "");
824
+ #endif
825
+ return get_wrapped_ts(self, rstr, mb_letter_tokenizer_new(lower));
826
+ }
827
+
828
+ /*
829
+ * call-seq:
830
+ * AsciiWhiteSpaceTokenizer.new() -> tokenizer
831
+ *
832
+ * Create a new AsciiWhiteSpaceTokenizer
833
+ */
834
+ static VALUE
835
+ frt_a_whitespace_tokenizer_init(VALUE self, VALUE rstr)
836
+ {
837
+ return get_wrapped_ts(self, rstr, whitespace_tokenizer_new());
838
+ }
839
+
840
+ /*
841
+ * call-seq:
842
+ * WhiteSpaceTokenizer.new(lower = true) -> tokenizer
843
+ *
844
+ * Create a new WhiteSpaceTokenizer which optionally downcases tokens.
845
+ * Downcasing is done according to the current locale.
846
+ *
847
+ * lower:: set to false if you don't wish to downcase tokens
848
+ */
849
+ static VALUE
850
+ frt_whitespace_tokenizer_init(int argc, VALUE *argv, VALUE self)
851
+ {
852
+ TS_ARGS(false);
853
+ #ifndef POSH_OS_WIN32
854
+ if (!frt_locale) frt_locale = setlocale(LC_CTYPE, "");
855
+ #endif
856
+ return get_wrapped_ts(self, rstr, mb_whitespace_tokenizer_new(lower));
857
+ }
858
+
859
+ /*
860
+ * call-seq:
861
+ * AsciiStandardTokenizer.new() -> tokenizer
862
+ *
863
+ * Create a new AsciiStandardTokenizer
864
+ */
865
+ static VALUE
866
+ frt_a_standard_tokenizer_init(VALUE self, VALUE rstr)
867
+ {
868
+ return get_wrapped_ts(self, rstr, standard_tokenizer_new());
869
+ }
870
+
871
+ /*
872
+ * call-seq:
873
+ * StandardTokenizer.new(lower = true) -> tokenizer
874
+ *
875
+ * Create a new StandardTokenizer which optionally downcases tokens.
876
+ * Downcasing is done according to the current locale.
877
+ *
878
+ * lower:: set to false if you don't wish to downcase tokens
879
+ */
880
+ static VALUE
881
+ frt_standard_tokenizer_init(VALUE self, VALUE rstr)
882
+ {
883
+ #ifndef POSH_OS_WIN32
884
+ if (!frt_locale) frt_locale = setlocale(LC_CTYPE, "");
885
+ #endif
886
+ return get_wrapped_ts(self, rstr, mb_standard_tokenizer_new());
887
+ }
888
+
889
+ /****************************************************************************
890
+ * Filters
891
+ ****************************************************************************/
892
+
893
+
894
+ /*
895
+ * call-seq:
896
+ * AsciiLowerCaseFilter.new(token_stream) -> token_stream
897
+ *
898
+ * Create an AsciiLowerCaseFilter which normalizes a token's text to
899
+ * lowercase but only for ASCII characters. For other characters use
900
+ * LowerCaseFilter.
901
+ */
902
+ static VALUE
903
+ frt_a_lowercase_filter_init(VALUE self, VALUE rsub_ts)
904
+ {
905
+ TokenStream *ts = frt_get_cwrapped_rts(rsub_ts);
906
+ ts = lowercase_filter_new(ts);
907
+ object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
908
+
909
+ Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
910
+ object_add(ts, self);
911
+ return self;
912
+ }
913
+
914
+ /*
915
+ * call-seq:
916
+ * LowerCaseFilter.new(token_stream) -> token_stream
917
+ *
918
+ * Create a LowerCaseFilter which normalizes a token's text to
919
+ * lowercase based on the current locale.
920
+ */
921
+ static VALUE
922
+ frt_lowercase_filter_init(VALUE self, VALUE rsub_ts)
923
+ {
924
+ TokenStream *ts = frt_get_cwrapped_rts(rsub_ts);
925
+ #ifndef POSH_OS_WIN32
926
+ if (!frt_locale) frt_locale = setlocale(LC_CTYPE, "");
927
+ #endif
928
+ ts = mb_lowercase_filter_new(ts);
929
+ object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
930
+
931
+ Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
932
+ object_add(ts, self);
933
+ return self;
934
+ }
935
+
936
+ /*
937
+ * call-seq:
938
+ * HyphenFilter.new(token_stream) -> token_stream
939
+ *
940
+ * Create a HyphenFilter which filters hyphenated words. The way it works is
941
+ * by adding both the word concatenated into a single word and split into
942
+ * multiple words, i.e. "e-mail" becomes "email" and "e mail". This way
943
+ * searches for "e-mail", "email" and "mail" will all match. This filter is
944
+ * used by default by the StandardAnalyzer.
945
+ */
946
+ static VALUE
947
+ frt_hyphen_filter_init(VALUE self, VALUE rsub_ts)
948
+ {
949
+ TokenStream *ts = frt_get_cwrapped_rts(rsub_ts);
950
+ ts = hyphen_filter_new(ts);
951
+ object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
952
+
953
+ Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
954
+ object_add(ts, self);
955
+ return self;
956
+ }
957
+
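+ /*
+  * A minimal sketch, assuming the tokenizer takes the input string as its
+  * first argument (as the C init functions above suggest):
+  *
+  *   ts = HyphenFilter.new(AsciiStandardTokenizer.new("Send an e-mail"))
+  *   # "e-mail" is emitted both joined ("email") and split ("e", "mail"),
+  *   # so searches for "e-mail", "email" or "mail" can all match
+  */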
958
+ /*
959
+ * call-seq:
960
+ * StopFilter.new(token_stream) -> token_stream
961
+ * StopFilter.new(token_stream, ["the", "and", "it"]) -> token_stream
962
+ *
963
+ * Create a StopFilter which removes *stop-words* from a TokenStream. You can
964
+ * optionally specify the stopwords you wish to have removed.
965
+ *
966
+ * token_stream:: TokenStream to be filtered
967
+ * stop_words:: Array of *stop-words* you wish to be filtered out. This
968
+ * defaults to a list of English stop-words. The
969
+ * Ferret::Analysis module contains a number of stop-word lists.
970
+ */
971
+ static VALUE
972
+ frt_stop_filter_init(int argc, VALUE *argv, VALUE self)
973
+ {
974
+ VALUE rsub_ts, rstop_words;
975
+ TokenStream *ts;
976
+ rb_scan_args(argc, argv, "11", &rsub_ts, &rstop_words);
977
+ ts = frt_get_cwrapped_rts(rsub_ts);
978
+ if (rstop_words != Qnil) {
979
+ char **stop_words = get_stopwords(rstop_words);
980
+ ts = stop_filter_new_with_words(ts, (const char **)stop_words);
981
+
982
+ free(stop_words);
983
+ } else {
984
+ ts = stop_filter_new(ts);
985
+ }
986
+ object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
987
+
988
+ Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
989
+ object_add(ts, self);
990
+ return self;
991
+ }
992
+
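+ /*
+  * A minimal sketch, assuming the tokenizer takes the input string as its
+  * first argument; omit the array to fall back to the default English list:
+  *
+  *   ts = StopFilter.new(AsciiLetterTokenizer.new("the quick and the dead"),
+  *                       ["the", "and"])
+  *   # emits "quick" and "dead"
+  */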
993
+ static INLINE void frt_add_mapping_i(TokenStream *mf, VALUE from, char *to)
994
+ {
995
+ switch (TYPE(from)) {
996
+ case T_STRING:
997
+ mapping_filter_add(mf, rs2s(from), to);
998
+ break;
999
+ case T_SYMBOL:
1000
+ mapping_filter_add(mf, rb_id2name(SYM2ID(from)), to);
1001
+ break;
1002
+ default:
1003
+ rb_raise(rb_eArgError,
1004
+ "cannot map from %s with MappingFilter",
1005
+ rs2s(rb_obj_as_string(from)));
1006
+ break;
1007
+ }
1008
+ }
1009
+
1010
+ static int frt_add_mappings_i(VALUE key, VALUE value, VALUE arg)
1011
+ {
1012
+ if (key == Qundef) {
1013
+ return ST_CONTINUE;
1014
+ } else {
1015
+ TokenStream *mf = (TokenStream *)arg;
1016
+ char *to;
1017
+ switch (TYPE(value)) {
1018
+ case T_STRING:
1019
+ to = rs2s(value);
1020
+ break;
1021
+ case T_SYMBOL:
1022
+ to = rb_id2name(SYM2ID(value));
1023
+ break;
1024
+ default:
1025
+ rb_raise(rb_eArgError,
1026
+ "cannot map to %s with MappingFilter",
1027
+ rs2s(rb_obj_as_string(key)));
1028
+ break;
1029
+ }
1030
+ if (TYPE(key) == T_ARRAY) {
1031
+ int i;
1032
+ for (i = RARRAY(key)->len - 1; i >= 0; i--) {
1033
+ frt_add_mapping_i(mf, RARRAY(key)->ptr[i], to);
1034
+ }
1035
+ }
1036
+ else {
1037
+ frt_add_mapping_i(mf, key, to);
1038
+ }
1039
+ }
1040
+ return ST_CONTINUE;
1041
+ }
1042
+
1043
+
1044
+ /*
1045
+ * call-seq:
1046
+ * MappingFilter.new(token_stream, mapping) -> token_stream
1047
+ *
1048
+ * Create a MappingFilter which maps strings in tokens. This is usually used
1049
+ * to map UTF-8 characters to ASCII characters for easier searching and
1050
+ * better search recall. The mapping is compiled into a Deterministic Finite
1051
+ * Automaton so it is super fast. This Filter can therefore be used for
1052
+ * indexing very large datasets. Currently regular expressions are not
1053
+ * supported. If you are really interested in the feature, please contact me
1054
+ * at dbalmain@gmail.com.
1055
+ *
1056
+ * token_stream:: TokenStream to be filtered
1057
+ * mapping:: Hash of mappings to apply to tokens. The key can be a
1058
+ * String or an Array of Strings. The value must be a String
1059
+ *
1060
+ * == Example
1061
+ *
1062
+ * filt = MappingFilter.new(token_stream,
1063
+ * {
1064
+ * ['à','á','â','ã','ä','å'] => 'a',
1065
+ * ['è','é','ê','ë','ē','ę'] => 'e'
1066
+ * })
1067
+ */
1068
+ static VALUE
1069
+ frt_mapping_filter_init(VALUE self, VALUE rsub_ts, VALUE mapping)
1070
+ {
1071
+ TokenStream *ts;
1072
+ ts = frt_get_cwrapped_rts(rsub_ts);
1073
+ ts = mapping_filter_new(ts);
1074
+ rb_hash_foreach(mapping, frt_add_mappings_i, (VALUE)ts);
1075
+ mulmap_compile(((MappingFilter *)ts)->mapper);
1076
+ object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
1077
+
1078
+ Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
1079
+ object_add(ts, self);
1080
+ return self;
1081
+ }
1082
+
1083
+ /*
1084
+ * call-seq:
1085
+ * StemFilter.new(token_stream) -> token_stream
1086
+ * StemFilter.new(token_stream,
1087
+ * algorithm="english",
1088
+ * encoding="UTF-8") -> token_stream
1089
+ *
1090
+ * Create a StemFilter which uses a snowball stemmer (thank you Martin
1091
+ * Porter) to stem words. You can optionally specify the algorithm (default:
1092
+ * "english") and encoding (default: "UTF-8").
1093
+ *
1094
+ * token_stream:: TokenStream to be filtered
1095
+ * algorithm:: The algorithm (or language) to use
1096
+ * encoding:: The encoding of the data (default: "UTF-8")
1097
+ */
1098
+ static VALUE
1099
+ frt_stem_filter_init(int argc, VALUE *argv, VALUE self)
1100
+ {
1101
+ VALUE rsub_ts, ralgorithm, rcharenc;
1102
+ char *algorithm = "english";
1103
+ char *charenc = NULL;
1104
+ TokenStream *ts;
1105
+ rb_scan_args(argc, argv, "12", &rsub_ts, &ralgorithm, &rcharenc);
1106
+ ts = frt_get_cwrapped_rts(rsub_ts);
1107
+ switch (argc) {
1108
+ case 3: charenc = rs2s(rb_obj_as_string(rcharenc));
1109
+ case 2: algorithm = rs2s(rb_obj_as_string(ralgorithm));
1110
+ }
1111
+ ts = stem_filter_new(ts, algorithm, charenc);
1112
+ object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
1113
+
1114
+ Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
1115
+ object_add(ts, self);
1116
+ return self;
1117
+ }
1118
+
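+ /*
+  * A hedged sketch of the call-seq above (algorithm and encoding are passed
+  * through to the snowball stemmer; the tokenizer input string is assumed):
+  *
+  *   ts = StemFilter.new(AsciiLetterTokenizer.new("debating debates"),
+  *                       "english", "UTF-8")
+  *   # both tokens are stemmed to "debat"
+  */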
1119
+ /****************************************************************************
1120
+ *
1121
+ * Analyzer Methods
1122
+ *
1123
+ ****************************************************************************/
1124
+
1125
+ /****************************************************************************
1126
+ * CWrappedAnalyzer Methods
1127
+ ****************************************************************************/
1128
+
1129
+ #define GET_A(a, self) Data_Get_Struct(self, Analyzer, a)
1130
+
1131
+ #define CWA(analyzer) ((CWrappedAnalyzer *)(analyzer))
1132
+ typedef struct CWrappedAnalyzer
1133
+ {
1134
+ Analyzer super;
1135
+ VALUE ranalyzer;
1136
+ } CWrappedAnalyzer;
1137
+
1138
+ static void
1139
+ cwa_destroy_i(Analyzer *a)
1140
+ {
1141
+ rb_hash_delete(object_space, ((VALUE)a)|1);
1142
+ /*printf("rb_hash_size = %d\n", frt_rb_hash_size(object_space)); */
1143
+ free(a);
1144
+ }
1145
+
1146
+ static TokenStream *
1147
+ cwa_get_ts(Analyzer *a, char *field, char *text)
1148
+ {
1149
+ VALUE rts = rb_funcall(CWA(a)->ranalyzer, id_token_stream, 2,
1150
+ ID2SYM(rb_intern(field)), rb_str_new2(text));
1151
+ return frt_get_cwrapped_rts(rts);
1152
+ }
1153
+
1154
+ Analyzer *
1155
+ frt_get_cwrapped_analyzer(VALUE ranalyzer)
1156
+ {
1157
+ Analyzer *a = NULL;
1158
+ if (frt_is_cclass(ranalyzer) && DATA_PTR(ranalyzer)) {
1159
+ Data_Get_Struct(ranalyzer, Analyzer, a);
1160
+ REF(a);
1161
+ }
1162
+ else {
1163
+ a = (Analyzer *)ecalloc(sizeof(CWrappedAnalyzer));
1164
+ a->destroy_i = &cwa_destroy_i;
1165
+ a->get_ts = &cwa_get_ts;
1166
+ a->ref_cnt = 1;
1167
+ ((CWrappedAnalyzer *)a)->ranalyzer = ranalyzer;
1168
+ /* prevent from being garbage collected */
1169
+ rb_hash_aset(object_space, ((VALUE)a)|1, ranalyzer);
1170
+ }
1171
+ return a;
1172
+ }
1173
+
1174
+ static void
1175
+ frt_analyzer_free(Analyzer *a)
1176
+ {
1177
+ object_del(a);
1178
+ a_deref(a);
1179
+ }
1180
+
1181
+ VALUE
1182
+ frt_get_analyzer(Analyzer *a)
1183
+ {
1184
+ VALUE self = Qnil;
1185
+ if (a) {
1186
+ self = object_get(a);
1187
+ if (self == Qnil) {
1188
+ self = Data_Wrap_Struct(cAnalyzer, NULL, &frt_analyzer_free, a);
1189
+ REF(a);
1190
+ object_add(a, self);
1191
+ }
1192
+ }
1193
+ return self;
1194
+ }
1195
+
1196
+ INLINE VALUE
1197
+ get_rb_ts_from_a(Analyzer *a, VALUE rfield, VALUE rstring)
1198
+ {
1199
+ TokenStream *ts = a_get_ts(a, frt_field(rfield), rs2s(rstring));
1200
+
1201
+ /* Make sure that there is no entry already */
1202
+ object_set(&ts->text, rstring);
1203
+ return get_rb_token_stream(ts);
1204
+ }
1205
+
1206
+ /*
1207
+ * call-seq:
1208
+ * analyzer.token_stream(field_name, input) -> token_stream
1209
+ *
1210
+ * Create a new TokenStream to tokenize +input+. The TokenStream created may
1211
+ * also depend on the +field_name+, although this parameter is typically
1212
+ * ignored.
1213
+ *
1214
+ * field_name:: name of the field to be tokenized
1215
+ * input:: data from the field to be tokenized
1216
+ */
1217
+ static VALUE
1218
+ frt_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
1219
+ {
1220
+ /* NOTE: Any changes made to this method may also need to be applied to
1221
+ * frt_re_analyzer_token_stream */
1222
+ Analyzer *a;
1223
+ GET_A(a, self);
1224
+
1225
+ StringValue(rstring);
1226
+
1227
+ return get_rb_ts_from_a(a, rfield, rstring);
1228
+ }
1229
+
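+ /*
+  * A short sketch (the field name is typically a Symbol and is ignored by
+  * most analyzers; WhiteSpaceAnalyzer is used here purely for illustration):
+  *
+  *   ts = WhiteSpaceAnalyzer.new.token_stream(:title, "e-mail me")
+  *   ts.next.text #=> "e-mail"
+  */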
1230
+ #define GET_LOWER(dflt) \
1231
+ bool lower;\
1232
+ VALUE rlower;\
1233
+ rb_scan_args(argc, argv, "01", &rlower);\
1234
+ lower = (argc ? RTEST(rlower) : dflt)
1235
+
1236
+ /*
1237
+ * call-seq:
1238
+ * AsciiWhiteSpaceAnalyzer.new(lower = false) -> analyzer
1239
+ *
1240
+ * Create a new AsciiWhiteSpaceAnalyzer which leaves case as is by default
1241
+ * but can optionally downcase tokens. Lowercasing will only be done to
1242
+ * ASCII characters.
1243
+ *
1244
+ * lower:: set to true if you want the field's tokens to be downcased
1245
+ */
1246
+ static VALUE
1247
+ frt_a_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
1248
+ {
1249
+ Analyzer *a;
1250
+ GET_LOWER(false);
1251
+ a = whitespace_analyzer_new(lower);
1252
+ Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
1253
+ object_add(a, self);
1254
+ return self;
1255
+ }
1256
+
1257
+ /*
1258
+ * call-seq:
1259
+ * WhiteSpaceAnalyzer.new(lower = false) -> analyzer
1260
+ *
1261
+ * Create a new WhiteSpaceAnalyzer which leaves case as is by default but can
1262
+ * optionally downcase tokens. Lowercasing will be done based on the current
1263
+ * locale.
1264
+ *
1265
+ * lower:: set to true if you want the field's tokens to be downcased
1266
+ */
1267
+ static VALUE
1268
+ frt_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
1269
+ {
1270
+ Analyzer *a;
1271
+ GET_LOWER(false);
1272
+ #ifndef POSH_OS_WIN32
1273
+ if (!frt_locale) frt_locale = setlocale(LC_CTYPE, "");
1274
+ #endif
1275
+ a = mb_whitespace_analyzer_new(lower);
1276
+ Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
1277
+ object_add(a, self);
1278
+ return self;
1279
+ }
1280
+
1281
+ /*
1282
+ * call-seq:
1283
+ * AsciiLetterAnalyzer.new(lower = true) -> analyzer
1284
+ *
1285
+ * Create a new AsciiLetterAnalyzer which downcases tokens by default
1286
+ * but can optionally leave case as is. Lowercasing will only be done to
1287
+ * ASCII characters.
1288
+ *
1289
+ * lower:: set to false if you don't want the field's tokens to be downcased
1290
+ */
1291
+ static VALUE
1292
+ frt_a_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
1293
+ {
1294
+ Analyzer *a;
1295
+ GET_LOWER(true);
1296
+ a = letter_analyzer_new(lower);
1297
+ Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
1298
+ object_add(a, self);
1299
+ return self;
1300
+ }
1301
+
1302
+ /*
1303
+ * call-seq:
1304
+ * LetterAnalyzer.new(lower = true) -> analyzer
1305
+ *
1306
+ * Create a new LetterAnalyzer which downcases tokens by default but can
1307
+ * optionally leave case as is. Lowercasing will be done based on the current
1308
+ * locale.
1309
+ *
1310
+ * lower:: set to false if you don't want the field's tokens to be downcased
1311
+ */
1312
+ static VALUE
1313
+ frt_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
1314
+ {
1315
+ Analyzer *a;
1316
+ GET_LOWER(true);
1317
+ #ifndef POSH_OS_WIN32
1318
+ if (!frt_locale) frt_locale = setlocale(LC_CTYPE, "");
1319
+ #endif
1320
+ a = mb_letter_analyzer_new(lower);
1321
+ Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
1322
+ object_add(a, self);
1323
+ return self;
1324
+ }
1325
+
1326
+ static VALUE
1327
+ get_rstopwords(const char **stop_words)
1328
+ {
1329
+ char **w = (char **)stop_words;
1330
+ VALUE rstopwords = rb_ary_new();
1331
+
1332
+ while (*w) {
1333
+ rb_ary_push(rstopwords, rb_str_new2(*w));
1334
+ w++;
1335
+ }
1336
+ return rstopwords;
1337
+ }
1338
+
1339
+ /*
1340
+ * call-seq:
1341
+ * AsciiStandardAnalyzer.new(stop_words = FULL_ENGLISH_STOP_WORDS, lower = true)
1342
+ * -> analyzer
1343
+ *
1344
+ * Create a new AsciiStandardAnalyzer which downcases tokens by default but
1345
+ * can optionally leave case as is. Lowercasing will only be done to ASCII
1346
+ * characters. You can also set the list of stop-words to be used by the
1347
+ * StopFilter.
1348
+ *
1349
+ * lower:: set to false if you don't want the field's tokens to be downcased
1350
+ * stop_words:: list of stop-words to pass to the StopFilter
1351
+ */
1352
+ static VALUE
1353
+ frt_a_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
1354
+ {
1355
+ bool lower;
1356
+ VALUE rlower, rstop_words;
1357
+ Analyzer *a;
1358
+ rb_scan_args(argc, argv, "02", &rstop_words, &rlower);
1359
+ lower = ((rlower == Qnil) ? true : RTEST(rlower));
1360
+ if (rstop_words != Qnil) {
1361
+ char **stop_words = get_stopwords(rstop_words);
1362
+ a = standard_analyzer_new_with_words((const char **)stop_words, lower);
1363
+ free(stop_words);
1364
+ } else {
1365
+ a = standard_analyzer_new(lower);
1366
+ }
1367
+ Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
1368
+ object_add(a, self);
1369
+ return self;
1370
+ }
1371
+
1372
+ /*
1373
+ * call-seq:
1374
+ * StandardAnalyzer.new(stop_words = FULL_ENGLISH_STOP_WORDS, lower=true)
1375
+ * -> analyzer
1376
+ *
1377
+ * Create a new StandardAnalyzer which downcases tokens by default but can
1378
+ * optionally leave case as is. Lowercasing will be done based on the current
1379
+ * locale. You can also set the list of stop-words to be used by the
1380
+ * StopFilter.
1381
+ *
1382
+ * lower:: set to false if you don't want the field's tokens to be downcased
1383
+ * stop_words:: list of stop-words to pass to the StopFilter
1384
+ */
1385
+ static VALUE
1386
+ frt_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
1387
+ {
1388
+ bool lower;
1389
+ VALUE rlower, rstop_words;
1390
+ Analyzer *a;
1391
+ #ifndef POSH_OS_WIN32
1392
+ if (!frt_locale) frt_locale = setlocale(LC_CTYPE, "");
1393
+ #endif
1394
+ rb_scan_args(argc, argv, "02", &rstop_words, &rlower);
1395
+ lower = ((rlower == Qnil) ? true : RTEST(rlower));
1396
+ if (rstop_words != Qnil) {
1397
+ char **stop_words = get_stopwords(rstop_words);
1398
+ a = mb_standard_analyzer_new_with_words((const char **)stop_words, lower);
1399
+ free(stop_words);
1400
+ } else {
1401
+ a = mb_standard_analyzer_new(lower);
1402
+ }
1403
+ Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
1404
+ object_add(a, self);
1405
+ return self;
1406
+ }
1407
+
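+ /*
+  * A minimal sketch of the two documented parameters (custom stop-words,
+  * then the lower flag); field name and input are illustrative:
+  *
+  *   a  = StandardAnalyzer.new(["and", "the"], false)
+  *   ts = a.token_stream(:content, "the quick and the dead")
+  *   # stop-words are dropped; remaining tokens keep their original case
+  */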
1408
+ static void
1409
+ frt_h_mark_values_i(void *key, void *value, void *arg)
1410
+ {
1411
+ frt_gc_mark(value);
1412
+ }
1413
+
1414
+ static void
1415
+ frt_pfa_mark(void *p)
1416
+ {
1417
+ frt_gc_mark(PFA(p)->default_a);
1418
+ h_each(PFA(p)->dict, &frt_h_mark_values_i, NULL);
1419
+ }
1420
+
1421
+ /*** PerFieldAnalyzer ***/
1422
+
1423
+ /*
1424
+ * call-seq:
1425
+ * PerFieldAnalyzer.new(default_analyzer) -> analyzer
1426
+ *
1427
+ * Create a new PerFieldAnalyzer specifying the default analyzer to use on
1428
+ * all fields that aren't set specifically.
1429
+ *
1430
+ * default_analyzer:: analyzer to be used on fields that aren't otherwise
1431
+ * specified
1432
+ */
1433
+ static VALUE
1434
+ frt_per_field_analyzer_init(VALUE self, VALUE ranalyzer)
1435
+ {
1436
+ Analyzer *def = frt_get_cwrapped_analyzer(ranalyzer);
1437
+ Analyzer *a = per_field_analyzer_new(def);
1438
+ Frt_Wrap_Struct(self, &frt_pfa_mark, &frt_analyzer_free, a);
1439
+ object_add(a, self);
1440
+ return self;
1441
+ }
1442
+
1443
+ /*
1444
+ * call-seq:
1445
+ * per_field_analyzer.add_field(field_name, default_analyzer) -> self
1446
+ * per_field_analyzer[field_name] = default_analyzer -> self
1447
+ *
1448
+ * Set the analyzer to be used on field +field_name+. Note that field_name
1449
+ * should be a symbol.
1450
+ *
1451
+ * field_name:: field we wish to set the analyzer for
1452
+ * analyzer:: analyzer to be used on +field_name+
1453
+ */
1454
+ static VALUE
1455
+ frt_per_field_analyzer_add_field(VALUE self, VALUE rfield, VALUE ranalyzer)
1456
+ {
1457
+ Analyzer *pfa, *a;
1458
+ Data_Get_Struct(self, Analyzer, pfa);
1459
+ a = frt_get_cwrapped_analyzer(ranalyzer);
1460
+
1461
+ pfa_add_field(pfa, frt_field(rfield), a);
1462
+ return self;
1463
+ }
1464
+
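+ /*
+  * A brief sketch combining a default analyzer with a per-field override via
+  * the []= alias documented above (field names as Symbols):
+  *
+  *   pfa = PerFieldAnalyzer.new(StandardAnalyzer.new)
+  *   pfa[:title] = WhiteSpaceAnalyzer.new
+  *   # :title is now tokenized on whitespace only; other fields use the default
+  */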
1465
+ /*
1466
+ * call-seq:
1467
+ * analyzer.token_stream(field_name, input) -> token_stream
1468
+ *
1469
+ * Create a new TokenStream to tokenize +input+. The TokenStream created will
1470
+ * also depend on the +field_name+ in the case of the PerFieldAnalyzer.
1471
+ *
1472
+ * field_name:: name of the field to be tokenized
1473
+ * input:: data from the field to be tokenized
1474
+ */
1475
+ static VALUE
1476
+ frt_pfa_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
1477
+ {
1478
+ Analyzer *pfa, *a;
1479
+ char *field = frt_field(rfield);
1480
+ GET_A(pfa, self);
1481
+
1482
+ StringValue(rstring);
1483
+ a = (Analyzer *)h_get(PFA(pfa)->dict, field);
1484
+ if (a == NULL) {
1485
+ a = PFA(pfa)->default_a;
1486
+ }
1487
+ if (a->get_ts == cwa_get_ts) {
1488
+ return rb_funcall(CWA(a)->ranalyzer, id_token_stream, 2,
1489
+ ID2SYM(rb_intern(field)), rb_str_new2(rs2s(rstring)));
1490
+ }
1491
+ else {
1492
+ return get_rb_ts_from_a(a, rfield, rstring);
1493
+ }
1494
+ }
1495
+
1496
+ /*** RegExpAnalyzer ***/
1497
+
1498
+ static void
1499
+ frt_re_analyzer_mark(Analyzer *a)
1500
+ {
1501
+ frt_gc_mark(a->current_ts);
1502
+ }
1503
+
1504
+ static void
1505
+ re_analyzer_destroy_i(Analyzer *a)
1506
+ {
1507
+ ts_deref(a->current_ts);
1508
+ free(a);
1509
+ }
1510
+
1511
+ /*
1512
+ * call-seq:
1513
+ * RegExpAnalyzer.new(reg_exp, lower = true) -> analyzer
1514
+ *
1515
+ * Create a new RegExpAnalyzer which will create tokenizers based on the
1516
+ * regular expression and lowercasing if required.
1517
+ *
1518
+ * reg_exp:: the token matcher for the tokenizer to use
1519
+ * lower:: set to false if you don't want to downcase the tokens
1520
+ */
1521
+ static VALUE
1522
+ frt_re_analyzer_init(int argc, VALUE *argv, VALUE self)
1523
+ {
1524
+ VALUE lower, rets, regex, proc;
1525
+ Analyzer *a;
1526
+ TokenStream *ts;
1527
+ rb_scan_args(argc, argv, "02&", &regex, &lower, &proc);
1528
+
1529
+ ts = rets_new(Qnil, regex, proc);
1530
+ rets = Data_Wrap_Struct(cRegExpTokenizer, &frt_rets_mark, &frt_rets_free, ts);
1531
+ object_add(ts, rets);
1532
+
1533
+ if (lower != Qfalse) {
1534
+ rets = frt_lowercase_filter_init(frt_data_alloc(cLowerCaseFilter), rets);
1535
+ ts = DATA_PTR(rets);
1536
+ }
1537
+ REF(ts);
1538
+
1539
+ a = analyzer_new(ts, &re_analyzer_destroy_i, NULL);
1540
+ Frt_Wrap_Struct(self, &frt_re_analyzer_mark, &frt_analyzer_free, a);
1541
+ object_add(a, self);
1542
+ return self;
1543
+ }
1544
+
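+ /*
+  * A hedged sketch (regexp plus the lower flag, per the call-seq above); the
+  * field name and input values are illustrative:
+  *
+  *   csv_analyzer = RegExpAnalyzer.new(/[^,]+/, false)
+  *   ts = csv_analyzer.token_stream(:row, "one,TWO,three")
+  *   # tokens: "one", "TWO", "three"
+  */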
1545
+ /*
1546
+ * call-seq:
1547
+ * analyzer.token_stream(field_name, input) -> token_stream
1548
+ *
1549
+ * Create a new TokenStream to tokenize +input+. The TokenStream created may
1550
+ * also depend on the +field_name+, although this parameter is typically
1551
+ * ignored.
1552
+ *
1553
+ * field_name:: name of the field to be tokenized
1554
+ * input:: data from the field to be tokenized
1555
+ */
1556
+ static VALUE
1557
+ frt_re_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rtext)
1558
+ {
1559
+ TokenStream *ts;
1560
+ Analyzer *a;
1561
+ GET_A(a, self);
1562
+
1563
+ StringValue(rtext);
1564
+
1565
+ ts = a_get_ts(a, frt_field(rfield), rs2s(rtext));
1566
+
1567
+ /* Make sure that there is no entry already */
1568
+ object_set(&ts->text, rtext);
1569
+ if (ts->next == &rets_next) {
1570
+ RETS(ts)->rtext = rtext;
1571
+ rb_hash_aset(object_space, ((VALUE)ts)|1, rtext);
1572
+ }
1573
+ else {
1574
+ RETS(((TokenFilter*)ts)->sub_ts)->rtext = rtext;
1575
+ rb_hash_aset(object_space, ((VALUE)((TokenFilter*)ts)->sub_ts)|1, rtext);
1576
+ }
1577
+ return get_rb_token_stream(ts);
1578
+ }
1579
+
1580
+ /****************************************************************************
1581
+ *
1582
+ * Locale stuff
1583
+ *
1584
+ ****************************************************************************/
1585
+
1586
+ /*
1587
+ * call-seq:
1588
+ * Ferret.locale -> locale_str
1589
+ *
1590
+ * Returns a string corresponding to the locale set. For example:
1591
+ *
1592
+ * puts Ferret.locale #=> "en_US.UTF-8"
1593
+ */
1594
+ static VALUE frt_get_locale(VALUE self, VALUE locale)
1595
+ {
1596
+ return (frt_locale ? rb_str_new2(frt_locale) : Qnil);
1597
+ }
1598
+
1599
+ /*
1600
+ * call-seq:
1601
+ * Ferret.locale = "en_US.UTF-8"
1602
+ *
1603
+ * Set the global locale. You should use this method to set different locales
1604
+ * when indexing documents with different encodings.
1605
+ */
1606
+ static VALUE frt_set_locale(VALUE self, VALUE locale)
1607
+ {
1608
+ char *l = ((locale == Qnil) ? NULL : rs2s(rb_obj_as_string(locale)));
1609
+ frt_locale = setlocale(LC_CTYPE, l);
1610
+ return frt_locale ? rb_str_new2(frt_locale) : Qnil;
1611
+ }
1612
+
1613
+ /****************************************************************************
1614
+ *
1615
+ * Init Functions
1616
+ *
1617
+ ****************************************************************************/
1618
+
1619
+ /*
1620
+ * Document-class: Ferret::Analysis::Token
1621
+ *
1622
+ * == Summary
1623
+ *
1624
+ * A Token is an occurrence of a term from the text of a field. It consists
1625
+ * of a term's text and the start and end offset of the term in the text of
1626
+ * the field.
1627
+ *
1628
+ * The start and end offsets permit applications to re-associate a token with
1629
+ * its source text, e.g., to display highlighted query terms in a document
1630
+ * browser, or to show matching text fragments in a KWIC (KeyWord In Context)
1631
+ * display, etc.
1632
+ *
1633
+ * === Attributes
1634
+ *
1635
+ * text:: the term's text, which may have been modified by a TokenFilter or
1636
+ * Tokenizer from the text originally found in the document
1637
+ * start:: is the position of the first character corresponding to
1638
+ * this token in the source text
1639
+ * end:: is equal to one greater than the position of the last
1640
+ * character corresponding to this token. Note that the
1641
+ * difference between @end_offset and @start_offset may not be
1642
+ * equal to @text.length(), as the term text may have been
1643
+ * altered by a stemmer or some other filter.
1644
+ */
1645
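+ /*
+ * A small sketch of these attributes, assuming Token.new takes the text
+ * followed by the start and end offsets;
+ *
+ * token = Token.new("brown", 10, 15)
+ * token.text #=> "brown"
+ * token.start #=> 10
+ * token.end #=> 15
+ */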
+ static void Init_Token(void)
1646
+ {
1647
+ cToken = rb_define_class_under(mAnalysis, "Token", rb_cObject);
1648
+ rb_define_alloc_func(cToken, frt_token_alloc);
1649
+ rb_include_module(cToken, rb_mComparable);
1650
+
1651
+ rb_define_method(cToken, "initialize", frt_token_init, -1);
1652
+ rb_define_method(cToken, "<=>", frt_token_cmp, 1);
1653
+ rb_define_method(cToken, "text", frt_token_get_text, 0);
1654
+ rb_define_method(cToken, "text=", frt_token_set_text, 1);
1655
+ rb_define_method(cToken, "start", frt_token_get_start_offset, 0);
1656
+ rb_define_method(cToken, "start=", frt_token_set_start_offset, 1);
1657
+ rb_define_method(cToken, "end", frt_token_get_end_offset, 0);
1658
+ rb_define_method(cToken, "end=", frt_token_set_end_offset, 1);
1659
+ rb_define_method(cToken, "pos_inc", frt_token_get_pos_inc, 0);
1660
+ rb_define_method(cToken, "pos_inc=", frt_token_set_pos_inc, 1);
1661
+ rb_define_method(cToken, "to_s", frt_token_to_s, 0);
1662
+ }
1663
+
1664
+ /*
1665
+ * Document-class: Ferret::Analysis::TokenStream
1666
+ *
1667
+ * A TokenStream enumerates the sequence of tokens, either from
1668
+ * fields of a document or from query text.
1669
+ *
1670
+ * This is an abstract class. Concrete subclasses are:
1671
+ *
1672
+ * Tokenizer:: a TokenStream whose input is a string
1673
+ * TokenFilter:: a TokenStream whose input is another TokenStream
1674
+ */
1675
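+ /*
+ * For example, a Tokenizer can be stepped through directly and a TokenFilter
+ * simply wraps another TokenStream. A sketch, using classes documented
+ * below and assuming +next+ returns nil once the stream is exhausted;
+ *
+ * ts = LowerCaseFilter.new(StandardTokenizer.new("The Quick Brown Fox"))
+ * while token = ts.next
+ * puts "#{token.text} [#{token.start}..#{token.end}]"
+ * end
+ */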
+ static void Init_TokenStream(void)
1676
+ {
1677
+ cTokenStream = rb_define_class_under(mAnalysis, "TokenStream", rb_cObject);
1678
+ frt_mark_cclass(cTokenStream);
1679
+ rb_define_method(cTokenStream, "next", frt_ts_next, 0);
1680
+ rb_define_method(cTokenStream, "text=", frt_ts_set_text, 1);
1681
+ rb_define_method(cTokenStream, "text", frt_ts_get_text, 0);
1682
+ }
1683
+
1684
+ /*
1685
+ * Document-class: Ferret::Analysis::AsciiLetterTokenizer
1686
+ *
1687
+ * An AsciiLetterTokenizer is a tokenizer that divides text at anything that
1688
+ * is not an ASCII letter. That is to say, it defines tokens as maximal strings
1689
+ * of adjacent ASCII letters, as defined by the regular expression _/[A-Za-z]+/_.
1690
+ *
1691
+ * === Example
1692
+ *
1693
+ * "Dave's résumé, at http://www.davebalmain.com/ 1234"
1694
+ * => ["Dave", "s", "r", "sum", "at", "http", "www", "davebalmain", "com"]
1695
+ */
1696
+ static void Init_AsciiLetterTokenizer(void)
1697
+ {
1698
+ cAsciiLetterTokenizer =
1699
+ rb_define_class_under(mAnalysis, "AsciiLetterTokenizer", cTokenStream);
1700
+ frt_mark_cclass(cAsciiLetterTokenizer);
1701
+ rb_define_alloc_func(cAsciiLetterTokenizer, frt_data_alloc);
1702
+ rb_define_method(cAsciiLetterTokenizer, "initialize",
1703
+ frt_a_letter_tokenizer_init, 1);
1704
+ }
1705
+
1706
+ /*
1707
+ * Document-class: Ferret::Analysis::LetterTokenizer
1708
+ *
1709
+ * A LetterTokenizer is a tokenizer that divides text at non-letters. That is
1710
+ * to say, it defines tokens as maximal strings of adjacent letters, as
1711
+ * defined by the regular expression _/[[:alpha:]]+/_, where [[:alpha:]] matches
1712
+ * all alphabetic characters in the current locale.
1713
+ *
1714
+ * === Example
1715
+ *
1716
+ * "Dave's résumé, at http://www.davebalmain.com/ 1234"
1717
+ * => ["Dave", "s", "résumé", "at", "http", "www", "davebalmain", "com"]
1718
+ */
1719
+ static void Init_LetterTokenizer(void)
1720
+ {
1721
+ cLetterTokenizer =
1722
+ rb_define_class_under(mAnalysis, "LetterTokenizer", cTokenStream);
1723
+ frt_mark_cclass(cLetterTokenizer);
1724
+ rb_define_alloc_func(cLetterTokenizer, frt_data_alloc);
1725
+ rb_define_method(cLetterTokenizer, "initialize",
1726
+ frt_letter_tokenizer_init, -1);
1727
+ }
1728
+
1729
+ /*
1730
+ * Document-class: Ferret::Analysis::AsciiWhiteSpaceTokenizer
1731
+ *
1732
+ * A WhiteSpaceTokenizer is a tokenizer that divides text at white-space.
1733
+ * Adjacent sequences of non-WhiteSpace characters form tokens.
1734
+ *
1735
+ * === Example
1736
+ *
1737
+ * "Dave's résumé, at http://www.davebalmain.com/ 1234"
1738
+ * => ["Dave's", "résumé,", "at", "http://www.davebalmain.com", "1234"]
1739
+ */
1740
+ static void Init_AsciiWhiteSpaceTokenizer(void)
1741
+ {
1742
+ cAsciiWhiteSpaceTokenizer =
1743
+ rb_define_class_under(mAnalysis, "AsciiWhiteSpaceTokenizer",
1744
+ cTokenStream);
1745
+ frt_mark_cclass(cAsciiWhiteSpaceTokenizer);
1746
+ rb_define_alloc_func(cAsciiWhiteSpaceTokenizer, frt_data_alloc);
1747
+ rb_define_method(cAsciiWhiteSpaceTokenizer, "initialize",
1748
+ frt_a_whitespace_tokenizer_init, 1);
1749
+ }
1750
+
1751
+ /*
1752
+ * Document-class: Ferret::Analysis::WhiteSpaceTokenizer
1753
+ *
1754
+ * A WhiteSpaceTokenizer is a tokenizer that divides text at white-space.
1755
+ * Adjacent sequences of non-WhiteSpace characters form tokens.
1756
+ *
1757
+ * === Example
1758
+ *
1759
+ * "Dave's résumé, at http://www.davebalmain.com/ 1234"
1760
+ * => ["Dave's", "résumé,", "at", "http://www.davebalmain.com", "1234"]
1761
+ */
1762
+ static void Init_WhiteSpaceTokenizer(void)
1763
+ {
1764
+ cWhiteSpaceTokenizer =
1765
+ rb_define_class_under(mAnalysis, "WhiteSpaceTokenizer", cTokenStream);
1766
+ frt_mark_cclass(cWhiteSpaceTokenizer);
1767
+ rb_define_alloc_func(cWhiteSpaceTokenizer, frt_data_alloc);
1768
+ rb_define_method(cWhiteSpaceTokenizer, "initialize",
1769
+ frt_whitespace_tokenizer_init, -1);
1770
+ }
1771
+
1772
+ /*
1773
+ * Document-class: Ferret::Analysis::AsciiStandardTokenizer
1774
+ *
1775
+ * The standard tokenizer is an advanced tokenizer which tokenizes most
1776
+ * words correctly as well as tokenizing things like email addresses, web
1777
+ * addresses, phone numbers, etc.
1778
+ *
1779
+ * === Example
1780
+ *
1781
+ * "Dave's résumé, at http://www.davebalmain.com/ 1234"
1782
+ * => ["Dave's", "r", "sum", "at", "http://www.davebalmain.com", "1234"]
1783
+ */
1784
+ static void Init_AsciiStandardTokenizer(void)
1785
+ {
1786
+ cAsciiStandardTokenizer =
1787
+ rb_define_class_under(mAnalysis, "AsciiStandardTokenizer", cTokenStream);
1788
+ frt_mark_cclass(cAsciiStandardTokenizer);
1789
+ rb_define_alloc_func(cAsciiStandardTokenizer, frt_data_alloc);
1790
+ rb_define_method(cAsciiStandardTokenizer, "initialize",
1791
+ frt_a_standard_tokenizer_init, 1);
1792
+ }
1793
+
1794
+ /*
1795
+ * Document-class: Ferret::Analysis::StandardTokenizer
1796
+ *
1797
+ * The standard tokenizer is an advanced tokenizer which tokenizes most
1798
+ * words correctly as well as tokenizing things like email addresses, web
1799
+ * addresses, phone numbers, etc.
1800
+ *
1801
+ * === Example
1802
+ *
1803
+ * "Dave's résumé, at http://www.davebalmain.com/ 1234"
1804
+ * => ["Dave's", "résumé", "at", "http://www.davebalmain.com", "1234"]
1805
+ */
1806
+ static void Init_StandardTokenizer(void)
1807
+ {
1808
+ cStandardTokenizer =
1809
+ rb_define_class_under(mAnalysis, "StandardTokenizer", cTokenStream);
1810
+ frt_mark_cclass(cStandardTokenizer);
1811
+ rb_define_alloc_func(cStandardTokenizer, frt_data_alloc);
1812
+ rb_define_method(cStandardTokenizer, "initialize",
1813
+ frt_standard_tokenizer_init, 1);
1814
+ }
1815
+
1816
+ /*
1817
+ * Document-class: Ferret::Analysis::RegExpTokenizer
1818
+ *
1819
+ * A tokenizer that recognizes tokens based on a regular expression passed to
1820
+ * the constructor. Most possible tokenizers can be created using this class.
1821
+ *
1822
+ * === Example
1823
+ *
1824
+ * Below is an example of a simple implementation of a LetterTokenizer using
1825
+ * a RegExpTokenizer. Basically, a token is a sequence of alphabetic
1826
+ * characters separated by one or more non-alphabetic characters.
1827
+ *
1828
+ * # of course you would add more than just é
1829
+ * RegExpTokenizer.new(input, /[[:alpha:]é]+/)
1830
+ *
1831
+ * "Dave's résumé, at http://www.davebalmain.com/ 1234"
1832
+ * => ["Dave", "s", "résumé", "at", "http", "www", "davebalmain", "com"]
1833
+ */
1834
+ static void Init_RegExpTokenizer(void)
1835
+ {
1836
+ cRegExpTokenizer =
1837
+ rb_define_class_under(mAnalysis, "RegExpTokenizer", cTokenStream);
1838
+ frt_mark_cclass(cRegExpTokenizer);
1839
+ rtoken_re = rb_reg_new(TOKEN_RE, strlen(TOKEN_RE), 0);
1840
+ rb_define_const(cRegExpTokenizer, "REGEXP", rtoken_re);
1841
+ rb_define_alloc_func(cRegExpTokenizer, frt_data_alloc);
1842
+ rb_define_method(cRegExpTokenizer, "initialize",
1843
+ frt_rets_init, -1);
1844
+ rb_define_method(cRegExpTokenizer, "text=", frt_rets_set_text, 1);
1845
+ rb_define_method(cRegExpTokenizer, "text", frt_rets_get_text, 0);
1846
+ }
1847
+
1848
+ /***************/
1849
+ /*** Filters ***/
1850
+ /***************/
1851
+
1852
+ /*
1853
+ * Document-class: Ferret::Analysis::AsciiLowerCaseFilter
1854
+ *
1855
+ * AsciiLowerCaseFilter normalizes a token's text to lowercase but only for
1856
+ * ASCII characters. For other characters use LowerCaseFilter.
1857
+ *
1858
+ * === Example
1859
+ *
1860
+ * ["One", "TWO", "three", "RÉSUMÉ"] => ["one", "two", "three", "rÉsumÉ"]
1861
+ *
1862
+ */
1863
+ static void Init_AsciiLowerCaseFilter(void)
1864
+ {
1865
+ cAsciiLowerCaseFilter =
1866
+ rb_define_class_under(mAnalysis, "AsciiLowerCaseFilter", cTokenStream);
1867
+ frt_mark_cclass(cAsciiLowerCaseFilter);
1868
+ rb_define_alloc_func(cAsciiLowerCaseFilter, frt_data_alloc);
1869
+ rb_define_method(cAsciiLowerCaseFilter, "initialize",
1870
+ frt_a_lowercase_filter_init, 1);
1871
+ }
1872
+
1873
+ /*
1874
+ * Document-class: Ferret::Analysis::LowerCaseFilter
1875
+ *
1876
+ * LowerCaseFilter normalizes a token's text to lowercase based on the
1877
+ * current locale.
1878
+ *
1879
+ * === Example
1880
+ *
1881
+ * ["One", "TWO", "three", "RÉSUMÉ"] => ["one", "two", "three", "résumé"]
1882
+ *
1883
+ */
1884
+ static void Init_LowerCaseFilter(void)
1885
+ {
1886
+ cLowerCaseFilter =
1887
+ rb_define_class_under(mAnalysis, "LowerCaseFilter", cTokenStream);
1888
+ frt_mark_cclass(cLowerCaseFilter);
1889
+ rb_define_alloc_func(cLowerCaseFilter, frt_data_alloc);
1890
+ rb_define_method(cLowerCaseFilter, "initialize",
1891
+ frt_lowercase_filter_init, 1);
1892
+ }
1893
+
1894
+ /*
1895
+ * Document-class: Ferret::Analysis::HyphenFilter
1896
+ *
1897
+ * HyphenFilter handles hyphenated words by adding both the concatenated form
1898
+ * and the individual words, i.e. "e-mail" becomes "email" as well as "e" and
1899
+ * "mail". This way searches for "e-mail", "email" and "mail" will all match.
1900
+ * This filter is used by default by the StandardAnalyzer.
1901
+ *
1902
+ * === Example
1903
+ *
1904
+ * ["e-mail", "set-up"] => ["email", "e", "mail", "setup", "set", "up"]
1905
+ *
1906
+ */
1907
+ static void Init_HyphenFilter(void)
1908
+ {
1909
+ cHyphenFilter =
1910
+ rb_define_class_under(mAnalysis, "HyphenFilter", cTokenStream);
1911
+ frt_mark_cclass(cHyphenFilter);
1912
+ rb_define_alloc_func(cHyphenFilter, frt_data_alloc);
1913
+ rb_define_method(cHyphenFilter, "initialize", frt_hyphen_filter_init, 1);
1914
+ }
1915
+
1916
+ /*
1917
+ * Document-class: Ferret::Analysis::MappingFilter
1918
+ *
1919
+ * A MappingFilter maps strings in tokens. This is usually used to map UTF-8
1920
+ * characters to ASCII characters for easier searching and better search
1921
+ * recall. The mapping is compiled into a Deterministic Finite Automaton so it
1922
+ * is super fast. This filter can therefore be used for indexing very large
1923
+ * datasets. Currently, regular expressions are not supported. If you are
1924
+ * really interested in the feature, please contact me at dbalmain@gmail.com.
1925
+ *
1926
+ * == Example
1927
+ *
1928
+ * mapping = {
1929
+ * ['à','á','â','ã','ä','å','ā','ă'] => 'a',
1930
+ * 'æ' => 'ae',
1931
+ * ['ď','đ'] => 'd',
1932
+ * ['ç','ć','č','ĉ','ċ'] => 'c',
1933
+ * ['è','é','ê','ë','ē','ę','ě','ĕ','ė',] => 'e',
1934
+ * ['ƒ'] => 'f',
1935
+ * ['ĝ','ğ','ġ','ģ'] => 'g',
1936
+ * ['ĥ','ħ'] => 'h',
1937
+ * ['ì','ì','í','î','ï','ī','ĩ','ĭ'] => 'i',
1938
+ * ['į','ı','ij','ĵ'] => 'j',
1939
+ * ['ķ','ĸ'] => 'k',
1940
+ * ['ł','ľ','ĺ','ļ','ŀ'] => 'l',
1941
+ * ['ñ','ń','ň','ņ','ʼn','ŋ'] => 'n',
1942
+ * ['ò','ó','ô','õ','ö','ø','ō','ő','ŏ','ŏ'] => 'o',
1943
+ * ['œ'] => 'oek',
1944
+ * ['ą'] => 'q',
1945
+ * ['ŕ','ř','ŗ'] => 'r',
1946
+ * ['ś','š','ş','ŝ','ș'] => 's',
1947
+ * ['ť','ţ','ŧ','ț'] => 't',
1948
+ * ['ù','ú','û','ü','ū','ů','ű','ŭ','ũ','ų'] => 'u',
1949
+ * ['ŵ'] => 'w',
1950
+ * ['ý','ÿ','ŷ'] => 'y',
1951
+ * ['ž','ż','ź'] => 'z'
1952
+ * }
1953
+ * filt = MappingFilter.new(token_stream, mapping)
1954
+ */
1955
+ static void Init_MappingFilter(void)
1956
+ {
1957
+ cMappingFilter =
1958
+ rb_define_class_under(mAnalysis, "MappingFilter", cTokenStream);
1959
+ frt_mark_cclass(cMappingFilter);
1960
+ rb_define_alloc_func(cMappingFilter, frt_data_alloc);
1961
+ rb_define_method(cMappingFilter, "initialize",
1962
+ frt_mapping_filter_init, 2);
1963
+ }
1964
+
1965
+ /*
1966
+ * Document-class: Ferret::Analysis::StopFilter
1967
+ *
1968
+ * A StopFilter filters *stop-words* from a TokenStream. Stop-words are words
1969
+ * that you don't wish to be indexed. Usually they will be common words like
1970
+ * "the" and "and", although you can specify whichever words you want.
1971
+ *
1972
+ * === Example
1973
+ *
1974
+ * ["the", "pig", "and", "whistle"] => ["pig", "whistle"]
1975
+ */
1976
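+ /*
+ * For example, a sketch using the two-argument form that also appears in the
+ * StandardAnalyzer example further down; +str+ stands in for the field data;
+ *
+ * ts = StopFilter.new(StandardTokenizer.new(str), ["and", "the"])
+ * ts = StopFilter.new(StandardTokenizer.new(str), FULL_ENGLISH_STOP_WORDS)
+ */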
+ static void Init_StopFilter(void)
1977
+ {
1978
+ cStopFilter =
1979
+ rb_define_class_under(mAnalysis, "StopFilter", cTokenStream);
1980
+ frt_mark_cclass(cStopFilter);
1981
+ rb_define_alloc_func(cStopFilter, frt_data_alloc);
1982
+ rb_define_method(cStopFilter, "initialize",
1983
+ frt_stop_filter_init, -1);
1984
+ }
1985
+
1986
+ /*
1987
+ * Document-class: Ferret::Analysis::StemFilter
1988
+ *
1989
+ * == Summary
1990
+ *
1991
+ * A StemFilter takes a term and transforms it as per the Snowball
1992
+ * stemming algorithm. Note: the input to the stemming filter must already
1993
+ * be in lower case, so you will need to use a LowerCaseFilter or a lowercasing
1994
+ * Tokenizer further down the Tokenizer chain in order for this to work
1995
+ * properly!
1996
+ *
1997
+ * === Available algorithms and encodings
1998
+ *
1999
+ * Algorithm Algorithm Pseudonyms Encoding
2000
+ * ----------------------------------------------------------------
2001
+ * "danish", | "da", "dan" | "ISO_8859_1", "UTF_8"
2002
+ * "dutch", | "dut", "nld" | "ISO_8859_1", "UTF_8"
2003
+ * "english", | "en", "eng" | "ISO_8859_1", "UTF_8"
2004
+ * "finnish", | "fi", "fin" | "ISO_8859_1", "UTF_8"
2005
+ * "french", | "fr", "fra", "fre" | "ISO_8859_1", "UTF_8"
2006
+ * "german", | "de", "deu", "ge", "ger" | "ISO_8859_1", "UTF_8"
2007
+ * "italian", | "it", "ita" | "ISO_8859_1", "UTF_8"
2008
+ * "norwegian", | "nl", "no" | "ISO_8859_1", "UTF_8"
2009
+ * "porter", | | "ISO_8859_1", "UTF_8"
2010
+ * "portuguese", | "por", "pt" | "ISO_8859_1", "UTF_8"
2011
+ * "russian", | "ru", "rus" | "KOI8_R", "UTF_8"
2012
+ * "spanish", | "es", "esl" | "ISO_8859_1", "UTF_8"
2013
+ * "swedish", | "sv", "swe" | "ISO_8859_1", "UTF_8"
2014
+ *
2015
+ * === Example
2016
+ *
2017
+ * To use this filter with other analyzers, you'll want to write an Analyzer
2018
+ * class that sets up the TokenStream chain as you want it. To use this with
2019
+ * a lowercasing Tokenizer, for example, you'd write an analyzer like this:
2020
+ *
2021
+ * class MyAnalyzer < Analyzer
2022
+ * def token_stream(field, str)
2023
+ * return StemFilter.new(LowerCaseFilter.new(StandardTokenizer.new(str)))
2024
+ * end
2025
+ * end
2026
+ *
2027
+ * "debate debates debated debating debater"
2028
+ * => ["debat", "debat", "debat", "debat", "debat"]
2029
+ *
2030
+ * === Attributes
2031
+ *
2032
+ * token_stream:: TokenStream to be filtered
2033
+ * algorithm:: The algorithm (or language) to use (default: "english")
2034
+ * encoding:: The encoding of the data (default: "UTF-8")
2035
+ */
2036
+ static void Init_StemFilter(void)
2037
+ {
2038
+ cStemFilter =
2039
+ rb_define_class_under(mAnalysis, "StemFilter", cTokenStream);
2040
+ frt_mark_cclass(cStemFilter);
2041
+ rb_define_alloc_func(cStemFilter, frt_data_alloc);
2042
+ rb_define_method(cStemFilter, "initialize",
2043
+ frt_stem_filter_init, -1);
2044
+ }
2045
+
2046
+ /*************************/
2047
+ /*** * * Analyzers * * ***/
2048
+ /*************************/
2049
+
2050
+ /*
2051
+ * Document-class: Ferret::Analysis::Analyzer
2052
+ *
2053
+ * == Summary
2054
+ *
2055
+ * An Analyzer builds TokenStreams, which analyze text. It thus represents
2056
+ * a policy for extracting index terms from text.
2057
+ *
2058
+ * Typical implementations first build a Tokenizer, which breaks the stream
2059
+ * of characters from the input into raw Tokens. One or more TokenFilters
2060
+ * may then be applied to the output of the Tokenizer.
2061
+ *
2062
+ * The default Analyzer creates a lowercasing LetterTokenizer which converts
2063
+ * all text to lowercase letter tokens. See LetterTokenizer for more details.
2064
+ *
2065
+ * === Example
2066
+ *
2067
+ * To create your own custom Analyzer you simply need to implement a
2068
+ * token_stream method which takes the field name and the data to be
2069
+ * tokenized as parameters and returns a TokenStream. Most analyzers
2070
+ * typically ignore the field name.
2071
+ *
2072
+ * Here we'll create a StemmingAnalyzer;
2073
+ *
2074
+ * class MyAnalyzer < Analyzer
2075
+ * def token_stream(field, str)
2076
+ * return StemFilter.new(LowerCaseFilter.new(StandardTokenizer.new(str)))
2077
+ * end
2078
+ * end
2079
+ */
2080
+ static void Init_Analyzer(void)
2081
+ {
2082
+ cAnalyzer =
2083
+ rb_define_class_under(mAnalysis, "Analyzer", rb_cObject);
2084
+ frt_mark_cclass(cAnalyzer);
2085
+ rb_define_alloc_func(cAnalyzer, frt_data_alloc);
2086
+ rb_define_method(cAnalyzer, "initialize", frt_letter_analyzer_init, -1);
2087
+ rb_define_method(cAnalyzer, "token_stream", frt_analyzer_token_stream, 2);
2088
+ }
2089
+
2090
+ /*
2091
+ * Document-class: Ferret::Analysis::AsciiLetterAnalyzer
2092
+ *
2093
+ * == Summary
2094
+ *
2095
+ * An AsciiLetterAnalyzer creates a TokenStream that splits the input up into
2096
+ * maximal strings of ASCII letters. If implemented in Ruby it would look
2097
+ * like;
2098
+ *
2099
+ * class AsciiLetterAnalyzer
2100
+ * def initialize(lower = true)
2101
+ * @lower = lower
2102
+ * end
2103
+ *
2104
+ * def token_stream(field, str)
2105
+ * if @lower
2106
+ * return AsciiLowerCaseFilter.new(AsciiLetterTokenizer.new(str))
2107
+ * else
2108
+ * return AsciiLetterTokenizer.new(str)
2109
+ * end
2110
+ * end
2111
+ * end
2112
+ *
2113
+ * As you can see it makes use of the AsciiLetterTokenizer and
2114
+ * AsciiLowerCaseFilter. Note that this tokenizer won't recognize non-ASCII
2115
+ * characters, so you should use the LetterAnalyzer if you want to analyze
2116
+ * multi-byte data such as UTF-8.
2117
+ */
2118
+ static void Init_AsciiLetterAnalyzer(void)
2119
+ {
2120
+ cAsciiLetterAnalyzer =
2121
+ rb_define_class_under(mAnalysis, "AsciiLetterAnalyzer", cAnalyzer);
2122
+ frt_mark_cclass(cAsciiLetterAnalyzer);
2123
+ rb_define_alloc_func(cAsciiLetterAnalyzer, frt_data_alloc);
2124
+ rb_define_method(cAsciiLetterAnalyzer, "initialize",
2125
+ frt_a_letter_analyzer_init, -1);
2126
+ }
2127
+
2128
+ /*
2129
+ * Document-class: Ferret::Analysis::LetterAnalyzer
2130
+ *
2131
+ * == Summary
2132
+ *
2133
+ * A LetterAnalyzer creates a TokenStream that splits the input up into
2134
+ * maximal strings of letters as recognized by the current locale. If
2135
+ * implemented in Ruby it would look like;
2136
+ *
2137
+ * class LetterAnalyzer
2138
+ * def initialize(lower = true)
2139
+ * @lower = lower
2140
+ * end
2141
+ *
2142
+ * def token_stream(field, str)
2143
+ * return LetterTokenizer.new(str, @lower)
2144
+ * end
2145
+ * end
2146
+ *
2147
+ * As you can see it makes use of the LetterTokenizer.
2148
+ */
2149
+ static void Init_LetterAnalyzer(void)
2150
+ {
2151
+ cLetterAnalyzer =
2152
+ rb_define_class_under(mAnalysis, "LetterAnalyzer", cAnalyzer);
2153
+ frt_mark_cclass(cLetterAnalyzer);
2154
+ rb_define_alloc_func(cLetterAnalyzer, frt_data_alloc);
2155
+ rb_define_method(cLetterAnalyzer, "initialize",
2156
+ frt_letter_analyzer_init, -1);
2157
+ }
2158
+
2159
+ /*
2160
+ * Document-class: Ferret::Analysis::AsciiWhiteSpaceAnalyzer
2161
+ *
2162
+ * == Summary
2163
+ *
2164
+ * The AsciiWhiteSpaceAnalyzer recognizes tokens as maximal strings of
2165
+ * non-whitespace characters. If implemented in Ruby the
2166
+ * AsciiWhiteSpaceAnalyzer would look like;
2167
+ *
2168
+ * class AsciiWhiteSpaceAnalyzer
2169
+ * def initialize(lower = true)
2170
+ * @lower = lower
2171
+ * end
2172
+ *
2173
+ * def token_stream(field, str)
2174
+ * if @lower
2175
+ * return AsciiLowerCaseFilter.new(AsciiWhiteSpaceTokenizer.new(str))
2176
+ * else
2177
+ * return AsciiWhiteSpaceTokenizer.new(str)
2178
+ * end
2179
+ * end
2180
+ * end
2181
+ *
2182
+ * As you can see it makes use of the AsciiWhiteSpaceTokenizer. You should
2183
+ * use WhiteSpaceAnalyzer if you want to recognize multibyte encodings such
2184
+ * as "UTF-8".
2185
+ */
2186
+ static void Init_AsciiWhiteSpaceAnalyzer(void)
2187
+ {
2188
+ cAsciiWhiteSpaceAnalyzer =
2189
+ rb_define_class_under(mAnalysis, "AsciiWhiteSpaceAnalyzer", cAnalyzer);
2190
+ frt_mark_cclass(cAsciiWhiteSpaceAnalyzer);
2191
+ rb_define_alloc_func(cAsciiWhiteSpaceAnalyzer, frt_data_alloc);
2192
+ rb_define_method(cAsciiWhiteSpaceAnalyzer, "initialize",
2193
+ frt_a_white_space_analyzer_init, -1);
2194
+ }
2195
+
2196
+ /*
2197
+ * Document-class: Ferret::Analysis::WhiteSpaceAnalyzer
2198
+ *
2199
+ * == Summary
2200
+ *
2201
+ * The WhiteSpaceAnalyzer recognizes tokens as maximal strings of
2202
+ * non-whitespace characters. If implemented in Ruby the WhiteSpaceAnalyzer
2203
+ * would look like;
2204
+ *
2205
+ * class WhiteSpaceAnalyzer
2206
+ * def initialize(lower = true)
2207
+ * @lower = lower
2208
+ * end
2209
+ *
2210
+ * def token_stream(field, str)
2211
+ * return WhiteSpaceTokenizer.new(str, @lower)
2212
+ * end
2213
+ * end
2214
+ *
2215
+ * As you can see it makes use of the WhiteSpaceTokenizer.
2216
+ */
2217
+ static void Init_WhiteSpaceAnalyzer(void)
2218
+ {
2219
+ cWhiteSpaceAnalyzer =
2220
+ rb_define_class_under(mAnalysis, "WhiteSpaceAnalyzer", cAnalyzer);
2221
+ frt_mark_cclass(cWhiteSpaceAnalyzer);
2222
+ rb_define_alloc_func(cWhiteSpaceAnalyzer, frt_data_alloc);
2223
+ rb_define_method(cWhiteSpaceAnalyzer, "initialize",
2224
+ frt_white_space_analyzer_init, -1);
2225
+ }
2226
+
2227
+ /*
2228
+ * Document-class: Ferret::Analysis::AsciiStandardAnalyzer
2229
+ *
2230
+ * == Summary
2231
+ *
2232
+ * The AsciiStandardAnalyzer is the most advanced of the available
2233
+ * ASCII analyzers. If it were implemented in Ruby it would look like this;
2234
+ *
2235
+ * class AsciiStandardAnalyzer
2236
+ * def initialize(stop_words = FULL_ENGLISH_STOP_WORDS, lower = true)
2237
+ * @lower = lower
2238
+ * @stop_words = stop_words
2239
+ * end
2240
+ *
2241
+ * def token_stream(field, str)
2242
+ * ts = AsciiStandardTokenizer.new(str)
2243
+ * ts = AsciiLowerCaseFilter.new(ts) if @lower
2244
+ * ts = StopFilter.new(ts, @stop_words)
2245
+ * ts = HyphenFilter.new(ts)
2246
+ * end
2247
+ * end
2248
+ *
2249
+ * As you can see it makes use of the AsciiStandardTokenizer and you can also
2250
+ * add your own list of stop-words if you wish. Note that this tokenizer
2251
+ * won't recognize non-ASCII characters, so you should use the
2252
+ * StandardAnalyzer if you want to analyze multi-byte data such as UTF-8.
2253
+ */
2254
+ static void Init_AsciiStandardAnalyzer(void)
2255
+ {
2256
+ cAsciiStandardAnalyzer =
2257
+ rb_define_class_under(mAnalysis, "AsciiStandardAnalyzer", cAnalyzer);
2258
+ frt_mark_cclass(cAsciiStandardAnalyzer);
2259
+ rb_define_alloc_func(cAsciiStandardAnalyzer, frt_data_alloc);
2260
+ rb_define_method(cAsciiStandardAnalyzer, "initialize",
2261
+ frt_a_standard_analyzer_init, -1);
2262
+ }
2263
+
2264
+ /*
2265
+ * Document-class: Ferret::Analysis::StandardAnalyzer
2266
+ *
2267
+ * == Summary
2268
+ *
2269
+ * The StandardAnalyzer is the most advanced of the available analyzers. If
2270
+ * it were implemented in Ruby it would look like this;
2271
+ *
2272
+ * class StandardAnalyzer
2273
+ * def initialize(stop_words = FULL_ENGLISH_STOP_WORDS, lower = true)
2274
+ * @lower = lower
2275
+ * @stop_words = stop_words
2276
+ * end
2277
+ *
2278
+ * def token_stream(field, str)
2279
+ * ts = StandardTokenizer.new(str)
2280
+ * ts = LowerCaseFilter.new(ts) if @lower
2281
+ * ts = StopFilter.new(ts, @stop_words)
2282
+ * ts = HyphenFilter.new(ts)
2283
+ * end
2284
+ * end
2285
+ *
2286
+ * As you can see it makes use of the StandardTokenizer and you can also add
2287
+ * your own list of stopwords if you wish.
2288
+ */
2289
+ static void Init_StandardAnalyzer(void)
2290
+ {
2291
+ cStandardAnalyzer =
2292
+ rb_define_class_under(mAnalysis, "StandardAnalyzer", cAnalyzer);
2293
+ frt_mark_cclass(cStandardAnalyzer);
2294
+ rb_define_alloc_func(cStandardAnalyzer, frt_data_alloc);
2295
+ rb_define_method(cStandardAnalyzer, "initialize",
2296
+ frt_standard_analyzer_init, -1);
2297
+ }
2298
+
2299
+ /*
2300
+ * Document-class: Ferret::Analysis::PerFieldAnalyzer
2301
+ *
2302
+ * == Summary
2303
+ *
2304
+ * The PerFieldAnalyzer is for use when you want to analyze different fields
2305
+ * with different analyzers. With the PerFieldAnalyzer you can specify how
2306
+ * you want each field analyzed.
2307
+ *
2308
+ * === Example
2309
+ *
2310
+ * # Create a new PerFieldAnalyzer which uses StandardAnalyzer by default
2311
+ * pfa = PerFieldAnalyzer.new(StandardAnalyzer.new())
2312
+ *
2313
+ * # Use the WhiteSpaceAnalyzer with no lowercasing on the :title field
2314
+ * pfa[:title] = WhiteSpaceAnalyzer.new(false)
2315
+ *
2316
+ * # Use a custom analyzer on the :created_at field
2317
+ * pfa[:created_at] = DateAnalyzer.new
2318
+ */
2319
+ static void Init_PerFieldAnalyzer(void)
2320
+ {
2321
+ cPerFieldAnalyzer =
2322
+ rb_define_class_under(mAnalysis, "PerFieldAnalyzer", cAnalyzer);
2323
+ frt_mark_cclass(cPerFieldAnalyzer);
2324
+ rb_define_alloc_func(cPerFieldAnalyzer, frt_data_alloc);
2325
+ rb_define_method(cPerFieldAnalyzer, "initialize",
2326
+ frt_per_field_analyzer_init, 1);
2327
+ rb_define_method(cPerFieldAnalyzer, "add_field",
2328
+ frt_per_field_analyzer_add_field, 2);
2329
+ rb_define_method(cPerFieldAnalyzer, "[]=",
2330
+ frt_per_field_analyzer_add_field, 2);
2331
+ rb_define_method(cPerFieldAnalyzer, "token_stream",
2332
+ frt_pfa_analyzer_token_stream, 2);
2333
+ }
2334
+
2335
+ /*
2336
+ * Document-class: Ferret::Analysis::RegExpAnalyzer
2337
+ *
2338
+ * == Summary
2339
+ *
2340
+ * Using a RegExpAnalyzer is a simple way to create a custom analyzer. If
2341
+ * implemented in Ruby it would look like this;
2342
+ *
2343
+ * class RegExpAnalyzer
2344
+ * def initialize(reg_exp, lower = true)
2345
+ * @lower = lower
2346
+ * @reg_exp = reg_exp
2347
+ * end
2348
+ *
2349
+ * def token_stream(field, str)
2350
+ * if @lower
2351
+ * return LowerCaseFilter.new(RegExpTokenizer.new(str, @reg_exp))
2352
+ * else
2353
+ * return RegExpTokenizer.new(str, @reg_exp)
2354
+ * end
2355
+ * end
2356
+ * end
2357
+ *
2358
+ * === Example
2359
+ *
2360
+ * csv_analyzer = RegExpAnalyzer.new(/[^,]+/, false)
2361
+ */
2362
+ static void Init_RegExpAnalyzer(void)
2363
+ {
2364
+ cRegExpAnalyzer =
2365
+ rb_define_class_under(mAnalysis, "RegExpAnalyzer", cAnalyzer);
2366
+ frt_mark_cclass(cRegExpAnalyzer);
2367
+ rb_define_alloc_func(cRegExpAnalyzer, frt_data_alloc);
2368
+ rb_define_method(cRegExpAnalyzer, "initialize",
2369
+ frt_re_analyzer_init, -1);
2370
+ rb_define_method(cRegExpAnalyzer, "token_stream",
2371
+ frt_re_analyzer_token_stream, 2);
2372
+ }
2373
+
2374
+ /* rdoc hack
2375
+ extern VALUE mFerret = rb_define_module("Ferret");
2376
+ */
2377
+
2378
+ /*
2379
+ * Document-module: Ferret::Analysis
2380
+ *
2381
+ * == Summary
2382
+ *
2383
+ * The Analysis module contains all the classes used to analyze and tokenize
2384
+ * the data to be indexed. There are three main classes you need to know
2385
+ * about when dealing with analysis; Analyzer, TokenStream and Token.
2386
+ *
2387
+ * == Classes
2388
+ *
2389
+ * === Analyzer
2390
+ *
2391
+ * Analyzers handle all of your tokenizing needs. You pass an Analyzer to the
2392
+ * indexing class when you create it and it will create the TokenStreams
2393
+ * necessary to tokenize the fields in the documents. Most of the time you
2394
+ * won't need to worry about TokenStreams and Tokens, one of the Analyzers
2395
+ * distributed with Ferret will do exactly what you need. Otherwise you'll
2396
+ * need to implement a custom analyzer.
2397
+ *
2398
+ * === TokenStream
2399
+ *
2400
+ * A TokenStream is an enumeration of Tokens. There are two standard types of
2401
+ * TokenStream; Tokenizer and TokenFilter. A Tokenizer takes a String and
2402
+ * turns it into a list of Tokens. A TokenFilter takes another TokenStream
2403
+ * and post-processes the Tokens. You can chain as many TokenFilters together
2404
+ * as you like but they always need to finish with a Tokenizer.
2405
+ *
2406
+ * === Token
2407
+ *
2408
+ * A Token is a single term from a document field. A token contains the text
2409
+ * representing the term as well as the start and end offset of the token.
2410
+ * The start and end offset will represent the token as it appears in the
2411
+ * source field. Some TokenFilters may change the text in the Token but the
2412
+ * start and end offsets should stay the same so (end - start) won't
2413
+ * necessarily be equal to the length of the text in the token. For example,
2412
+ * using a stemming TokenFilter, the term "Beginning" might have start and end
2415
+ * offsets of 10 and 19 respectively ("Beginning".length == 9) but Token#text
2416
+ * might be "begin" (after stemming).
2417
+ */
2418
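+ /*
+ * For example, the typical chain described above, a Tokenizer at the source
+ * with one or more TokenFilters wrapped around it, might be assembled like
+ * this (a sketch using the filters documented in this module);
+ *
+ * ts = StandardTokenizer.new("Running e-mail servers")
+ * ts = LowerCaseFilter.new(ts)
+ * ts = HyphenFilter.new(ts)
+ * ts = StemFilter.new(ts)
+ */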
+ void
2419
+ Init_Analysis(void)
2420
+ {
2421
+ mAnalysis = rb_define_module_under(mFerret, "Analysis");
2422
+
2423
+ /* TokenStream Methods */
2424
+ id_next = rb_intern("next");
2425
+ id_reset = rb_intern("text=");
2426
+ id_clone = rb_intern("clone");
2427
+ id_text = rb_intern("@text");
2428
+
2429
+ /* Analyzer Methods */
2430
+ id_token_stream = rb_intern("token_stream");
2431
+
2432
+ object_space = rb_hash_new();
2433
+ rb_define_const(mFerret, "OBJECT_SPACE", object_space);
2434
+
2435
+ /*** * * Locale stuff * * ***/
2436
+ rb_define_singleton_method(mFerret, "locale=", frt_set_locale, 1);
2437
+ rb_define_singleton_method(mFerret, "locale", frt_get_locale, 0);
2438
+
2439
+ rb_define_const(mAnalysis, "ENGLISH_STOP_WORDS",
2440
+ get_rstopwords(ENGLISH_STOP_WORDS));
2441
+ rb_define_const(mAnalysis, "FULL_ENGLISH_STOP_WORDS",
2442
+ get_rstopwords(FULL_ENGLISH_STOP_WORDS));
2443
+ rb_define_const(mAnalysis, "EXTENDED_ENGLISH_STOP_WORDS",
2444
+ get_rstopwords(EXTENDED_ENGLISH_STOP_WORDS));
2445
+ rb_define_const(mAnalysis, "FULL_FRENCH_STOP_WORDS",
2446
+ get_rstopwords(FULL_FRENCH_STOP_WORDS));
2447
+ rb_define_const(mAnalysis, "FULL_SPANISH_STOP_WORDS",
2448
+ get_rstopwords(FULL_SPANISH_STOP_WORDS));
2449
+ rb_define_const(mAnalysis, "FULL_PORTUGUESE_STOP_WORDS",
2450
+ get_rstopwords(FULL_PORTUGUESE_STOP_WORDS));
2451
+ rb_define_const(mAnalysis, "FULL_ITALIAN_STOP_WORDS",
2452
+ get_rstopwords(FULL_ITALIAN_STOP_WORDS));
2453
+ rb_define_const(mAnalysis, "FULL_GERMAN_STOP_WORDS",
2454
+ get_rstopwords(FULL_GERMAN_STOP_WORDS));
2455
+ rb_define_const(mAnalysis, "FULL_DUTCH_STOP_WORDS",
2456
+ get_rstopwords(FULL_DUTCH_STOP_WORDS));
2457
+ rb_define_const(mAnalysis, "FULL_SWEDISH_STOP_WORDS",
2458
+ get_rstopwords(FULL_SWEDISH_STOP_WORDS));
2459
+ rb_define_const(mAnalysis, "FULL_NORWEGIAN_STOP_WORDS",
2460
+ get_rstopwords(FULL_NORWEGIAN_STOP_WORDS));
2461
+ rb_define_const(mAnalysis, "FULL_DANISH_STOP_WORDS",
2462
+ get_rstopwords(FULL_DANISH_STOP_WORDS));
2463
+ rb_define_const(mAnalysis, "FULL_RUSSIAN_STOP_WORDS",
2464
+ get_rstopwords(FULL_RUSSIAN_STOP_WORDS));
2465
+ rb_define_const(mAnalysis, "FULL_FINNISH_STOP_WORDS",
2466
+ get_rstopwords(FULL_FINNISH_STOP_WORDS));
2467
+
2468
+ Init_Token();
2469
+ Init_TokenStream();
2470
+
2471
+ Init_AsciiLetterTokenizer();
2472
+ Init_LetterTokenizer();
2473
+
2474
+ Init_AsciiWhiteSpaceTokenizer();
2475
+ Init_WhiteSpaceTokenizer();
2476
+
2477
+ Init_AsciiStandardTokenizer();
2478
+ Init_StandardTokenizer();
2479
+
2480
+ Init_RegExpTokenizer();
2481
+
2482
+ Init_AsciiLowerCaseFilter();
2483
+ Init_LowerCaseFilter();
2484
+ Init_HyphenFilter();
2485
+ Init_StopFilter();
2486
+ Init_MappingFilter();
2487
+ Init_StemFilter();
2488
+
2489
+ Init_Analyzer();
2490
+ Init_AsciiLetterAnalyzer();
2491
+ Init_LetterAnalyzer();
2492
+ Init_AsciiWhiteSpaceAnalyzer();
2493
+ Init_WhiteSpaceAnalyzer();
2494
+ Init_AsciiStandardAnalyzer();
2495
+ Init_StandardAnalyzer();
2496
+ Init_PerFieldAnalyzer();
2497
+ Init_RegExpAnalyzer();
2498
+
2499
+ }