jk-ferret 0.11.8.2

Files changed (228)
  1. data/CHANGELOG +24 -0
  2. data/MIT-LICENSE +20 -0
  3. data/README +90 -0
  4. data/RELEASE_CHANGES +137 -0
  5. data/RELEASE_NOTES +60 -0
  6. data/Rakefile +443 -0
  7. data/TODO +109 -0
  8. data/TUTORIAL +231 -0
  9. data/bin/ferret-browser +79 -0
  10. data/ext/BZLIB_blocksort.c +1094 -0
  11. data/ext/BZLIB_bzlib.c +1578 -0
  12. data/ext/BZLIB_compress.c +672 -0
  13. data/ext/BZLIB_crctable.c +104 -0
  14. data/ext/BZLIB_decompress.c +626 -0
  15. data/ext/BZLIB_huffman.c +205 -0
  16. data/ext/BZLIB_randtable.c +84 -0
  17. data/ext/STEMMER_api.c +66 -0
  18. data/ext/STEMMER_libstemmer.c +93 -0
  19. data/ext/STEMMER_stem_ISO_8859_1_danish.c +337 -0
  20. data/ext/STEMMER_stem_ISO_8859_1_dutch.c +624 -0
  21. data/ext/STEMMER_stem_ISO_8859_1_english.c +1117 -0
  22. data/ext/STEMMER_stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/STEMMER_stem_ISO_8859_1_french.c +1246 -0
  24. data/ext/STEMMER_stem_ISO_8859_1_german.c +503 -0
  25. data/ext/STEMMER_stem_ISO_8859_1_hungarian.c +1230 -0
  26. data/ext/STEMMER_stem_ISO_8859_1_italian.c +1065 -0
  27. data/ext/STEMMER_stem_ISO_8859_1_norwegian.c +297 -0
  28. data/ext/STEMMER_stem_ISO_8859_1_porter.c +749 -0
  29. data/ext/STEMMER_stem_ISO_8859_1_portuguese.c +1017 -0
  30. data/ext/STEMMER_stem_ISO_8859_1_spanish.c +1093 -0
  31. data/ext/STEMMER_stem_ISO_8859_1_swedish.c +307 -0
  32. data/ext/STEMMER_stem_ISO_8859_2_romanian.c +998 -0
  33. data/ext/STEMMER_stem_KOI8_R_russian.c +700 -0
  34. data/ext/STEMMER_stem_UTF_8_danish.c +339 -0
  35. data/ext/STEMMER_stem_UTF_8_dutch.c +634 -0
  36. data/ext/STEMMER_stem_UTF_8_english.c +1125 -0
  37. data/ext/STEMMER_stem_UTF_8_finnish.c +768 -0
  38. data/ext/STEMMER_stem_UTF_8_french.c +1256 -0
  39. data/ext/STEMMER_stem_UTF_8_german.c +509 -0
  40. data/ext/STEMMER_stem_UTF_8_hungarian.c +1234 -0
  41. data/ext/STEMMER_stem_UTF_8_italian.c +1073 -0
  42. data/ext/STEMMER_stem_UTF_8_norwegian.c +299 -0
  43. data/ext/STEMMER_stem_UTF_8_porter.c +755 -0
  44. data/ext/STEMMER_stem_UTF_8_portuguese.c +1023 -0
  45. data/ext/STEMMER_stem_UTF_8_romanian.c +1004 -0
  46. data/ext/STEMMER_stem_UTF_8_russian.c +694 -0
  47. data/ext/STEMMER_stem_UTF_8_spanish.c +1097 -0
  48. data/ext/STEMMER_stem_UTF_8_swedish.c +309 -0
  49. data/ext/STEMMER_stem_UTF_8_turkish.c +2205 -0
  50. data/ext/STEMMER_utilities.c +478 -0
  51. data/ext/analysis.c +1710 -0
  52. data/ext/analysis.h +266 -0
  53. data/ext/api.h +26 -0
  54. data/ext/array.c +125 -0
  55. data/ext/array.h +62 -0
  56. data/ext/bitvector.c +96 -0
  57. data/ext/bitvector.h +594 -0
  58. data/ext/bzlib.h +282 -0
  59. data/ext/bzlib_private.h +503 -0
  60. data/ext/compound_io.c +384 -0
  61. data/ext/config.h +52 -0
  62. data/ext/document.c +159 -0
  63. data/ext/document.h +63 -0
  64. data/ext/except.c +102 -0
  65. data/ext/except.h +176 -0
  66. data/ext/extconf.rb +15 -0
  67. data/ext/ferret.c +416 -0
  68. data/ext/ferret.h +94 -0
  69. data/ext/field_index.c +262 -0
  70. data/ext/field_index.h +52 -0
  71. data/ext/filter.c +157 -0
  72. data/ext/fs_store.c +493 -0
  73. data/ext/global.c +458 -0
  74. data/ext/global.h +302 -0
  75. data/ext/hash.c +524 -0
  76. data/ext/hash.h +515 -0
  77. data/ext/hashset.c +192 -0
  78. data/ext/hashset.h +215 -0
  79. data/ext/header.h +58 -0
  80. data/ext/helper.c +63 -0
  81. data/ext/helper.h +21 -0
  82. data/ext/index.c +6804 -0
  83. data/ext/index.h +935 -0
  84. data/ext/internal.h +1019 -0
  85. data/ext/lang.c +10 -0
  86. data/ext/lang.h +68 -0
  87. data/ext/libstemmer.h +79 -0
  88. data/ext/mempool.c +88 -0
  89. data/ext/mempool.h +43 -0
  90. data/ext/modules.h +190 -0
  91. data/ext/multimapper.c +351 -0
  92. data/ext/multimapper.h +60 -0
  93. data/ext/posh.c +1006 -0
  94. data/ext/posh.h +973 -0
  95. data/ext/priorityqueue.c +149 -0
  96. data/ext/priorityqueue.h +155 -0
  97. data/ext/q_boolean.c +1621 -0
  98. data/ext/q_const_score.c +162 -0
  99. data/ext/q_filtered_query.c +212 -0
  100. data/ext/q_fuzzy.c +280 -0
  101. data/ext/q_match_all.c +149 -0
  102. data/ext/q_multi_term.c +673 -0
  103. data/ext/q_parser.c +3103 -0
  104. data/ext/q_phrase.c +1206 -0
  105. data/ext/q_prefix.c +98 -0
  106. data/ext/q_range.c +682 -0
  107. data/ext/q_span.c +2390 -0
  108. data/ext/q_term.c +337 -0
  109. data/ext/q_wildcard.c +167 -0
  110. data/ext/r_analysis.c +2626 -0
  111. data/ext/r_index.c +3468 -0
  112. data/ext/r_qparser.c +635 -0
  113. data/ext/r_search.c +4490 -0
  114. data/ext/r_store.c +513 -0
  115. data/ext/r_utils.c +1131 -0
  116. data/ext/ram_store.c +476 -0
  117. data/ext/scanner.c +895 -0
  118. data/ext/scanner.h +36 -0
  119. data/ext/scanner_mb.c +6701 -0
  120. data/ext/scanner_utf8.c +4415 -0
  121. data/ext/search.c +1864 -0
  122. data/ext/search.h +953 -0
  123. data/ext/similarity.c +151 -0
  124. data/ext/similarity.h +89 -0
  125. data/ext/sort.c +786 -0
  126. data/ext/stem_ISO_8859_1_danish.h +16 -0
  127. data/ext/stem_ISO_8859_1_dutch.h +16 -0
  128. data/ext/stem_ISO_8859_1_english.h +16 -0
  129. data/ext/stem_ISO_8859_1_finnish.h +16 -0
  130. data/ext/stem_ISO_8859_1_french.h +16 -0
  131. data/ext/stem_ISO_8859_1_german.h +16 -0
  132. data/ext/stem_ISO_8859_1_hungarian.h +16 -0
  133. data/ext/stem_ISO_8859_1_italian.h +16 -0
  134. data/ext/stem_ISO_8859_1_norwegian.h +16 -0
  135. data/ext/stem_ISO_8859_1_porter.h +16 -0
  136. data/ext/stem_ISO_8859_1_portuguese.h +16 -0
  137. data/ext/stem_ISO_8859_1_spanish.h +16 -0
  138. data/ext/stem_ISO_8859_1_swedish.h +16 -0
  139. data/ext/stem_ISO_8859_2_romanian.h +16 -0
  140. data/ext/stem_KOI8_R_russian.h +16 -0
  141. data/ext/stem_UTF_8_danish.h +16 -0
  142. data/ext/stem_UTF_8_dutch.h +16 -0
  143. data/ext/stem_UTF_8_english.h +16 -0
  144. data/ext/stem_UTF_8_finnish.h +16 -0
  145. data/ext/stem_UTF_8_french.h +16 -0
  146. data/ext/stem_UTF_8_german.h +16 -0
  147. data/ext/stem_UTF_8_hungarian.h +16 -0
  148. data/ext/stem_UTF_8_italian.h +16 -0
  149. data/ext/stem_UTF_8_norwegian.h +16 -0
  150. data/ext/stem_UTF_8_porter.h +16 -0
  151. data/ext/stem_UTF_8_portuguese.h +16 -0
  152. data/ext/stem_UTF_8_romanian.h +16 -0
  153. data/ext/stem_UTF_8_russian.h +16 -0
  154. data/ext/stem_UTF_8_spanish.h +16 -0
  155. data/ext/stem_UTF_8_swedish.h +16 -0
  156. data/ext/stem_UTF_8_turkish.h +16 -0
  157. data/ext/stopwords.c +410 -0
  158. data/ext/store.c +698 -0
  159. data/ext/store.h +799 -0
  160. data/ext/symbol.c +10 -0
  161. data/ext/symbol.h +23 -0
  162. data/ext/term_vectors.c +73 -0
  163. data/ext/threading.h +31 -0
  164. data/ext/win32.h +62 -0
  165. data/lib/ferret.rb +30 -0
  166. data/lib/ferret/browser.rb +246 -0
  167. data/lib/ferret/browser/s/global.js +192 -0
  168. data/lib/ferret/browser/s/style.css +148 -0
  169. data/lib/ferret/browser/views/document/list.rhtml +49 -0
  170. data/lib/ferret/browser/views/document/show.rhtml +27 -0
  171. data/lib/ferret/browser/views/error/index.rhtml +7 -0
  172. data/lib/ferret/browser/views/help/index.rhtml +8 -0
  173. data/lib/ferret/browser/views/home/index.rhtml +29 -0
  174. data/lib/ferret/browser/views/layout.rhtml +22 -0
  175. data/lib/ferret/browser/views/term-vector/index.rhtml +4 -0
  176. data/lib/ferret/browser/views/term/index.rhtml +199 -0
  177. data/lib/ferret/browser/views/term/termdocs.rhtml +1 -0
  178. data/lib/ferret/browser/webrick.rb +14 -0
  179. data/lib/ferret/document.rb +130 -0
  180. data/lib/ferret/field_infos.rb +44 -0
  181. data/lib/ferret/field_symbol.rb +87 -0
  182. data/lib/ferret/index.rb +973 -0
  183. data/lib/ferret/number_tools.rb +157 -0
  184. data/lib/ferret/version.rb +3 -0
  185. data/setup.rb +1555 -0
  186. data/test/long_running/largefile/tc_largefile.rb +46 -0
  187. data/test/test_all.rb +5 -0
  188. data/test/test_helper.rb +29 -0
  189. data/test/test_installed.rb +1 -0
  190. data/test/threading/number_to_spoken.rb +132 -0
  191. data/test/threading/thread_safety_index_test.rb +88 -0
  192. data/test/threading/thread_safety_read_write_test.rb +73 -0
  193. data/test/threading/thread_safety_test.rb +133 -0
  194. data/test/unit/analysis/tc_analyzer.rb +550 -0
  195. data/test/unit/analysis/tc_token_stream.rb +653 -0
  196. data/test/unit/index/tc_index.rb +867 -0
  197. data/test/unit/index/tc_index_reader.rb +699 -0
  198. data/test/unit/index/tc_index_writer.rb +447 -0
  199. data/test/unit/index/th_doc.rb +332 -0
  200. data/test/unit/query_parser/tc_query_parser.rb +238 -0
  201. data/test/unit/search/tc_filter.rb +156 -0
  202. data/test/unit/search/tc_fuzzy_query.rb +147 -0
  203. data/test/unit/search/tc_index_searcher.rb +67 -0
  204. data/test/unit/search/tc_multi_searcher.rb +128 -0
  205. data/test/unit/search/tc_multiple_search_requests.rb +58 -0
  206. data/test/unit/search/tc_search_and_sort.rb +179 -0
  207. data/test/unit/search/tc_sort.rb +49 -0
  208. data/test/unit/search/tc_sort_field.rb +27 -0
  209. data/test/unit/search/tc_spans.rb +190 -0
  210. data/test/unit/search/tm_searcher.rb +436 -0
  211. data/test/unit/store/tc_fs_store.rb +115 -0
  212. data/test/unit/store/tc_ram_store.rb +35 -0
  213. data/test/unit/store/tm_store.rb +34 -0
  214. data/test/unit/store/tm_store_lock.rb +68 -0
  215. data/test/unit/tc_document.rb +81 -0
  216. data/test/unit/tc_field_symbol.rb +26 -0
  217. data/test/unit/ts_analysis.rb +2 -0
  218. data/test/unit/ts_index.rb +2 -0
  219. data/test/unit/ts_largefile.rb +4 -0
  220. data/test/unit/ts_query_parser.rb +2 -0
  221. data/test/unit/ts_search.rb +2 -0
  222. data/test/unit/ts_store.rb +2 -0
  223. data/test/unit/ts_utils.rb +2 -0
  224. data/test/unit/utils/tc_bit_vector.rb +295 -0
  225. data/test/unit/utils/tc_number_tools.rb +117 -0
  226. data/test/unit/utils/tc_priority_queue.rb +106 -0
  227. data/test/utils/content_generator.rb +226 -0
  228. metadata +319 -0
data/ext/r_analysis.c ADDED
@@ -0,0 +1,2626 @@
+ #include "lang.h"
+ #ifdef FRT_RUBY_VERSION_1_9
+ # include <ruby/re.h>
+ #else
+ # include <regex.h>
+ #endif
+ #include <locale.h>
+ #ifdef FRT_RUBY_VERSION_1_9
+ # include <ruby/st.h>
+ #else
+ # include <st.h>
+ #endif
+ #include "ferret.h"
+ #include "analysis.h"
+
+ static char *frb_locale = NULL;
+
+ static VALUE mAnalysis;
+
+ static VALUE cToken;
+ static VALUE cAsciiLetterTokenizer;
+ static VALUE cLetterTokenizer;
+ static VALUE cAsciiWhiteSpaceTokenizer;
+ static VALUE cWhiteSpaceTokenizer;
+ static VALUE cAsciiStandardTokenizer;
+ static VALUE cStandardTokenizer;
+ static VALUE cRegExpTokenizer;
+
+ static VALUE cAsciiLowerCaseFilter;
+ static VALUE cLowerCaseFilter;
+ static VALUE cStopFilter;
+ static VALUE cMappingFilter;
+ static VALUE cHyphenFilter;
+ static VALUE cStemFilter;
+
+ static VALUE cAnalyzer;
+ static VALUE cAsciiLetterAnalyzer;
+ static VALUE cLetterAnalyzer;
+ static VALUE cAsciiWhiteSpaceAnalyzer;
+ static VALUE cWhiteSpaceAnalyzer;
+ static VALUE cAsciiStandardAnalyzer;
+ static VALUE cStandardAnalyzer;
+ static VALUE cPerFieldAnalyzer;
+ static VALUE cRegExpAnalyzer;
+
+ static VALUE cTokenStream;
+
+ /* TokenStream Methods */
+ static ID id_next;
+ static ID id_reset;
+ static ID id_clone;
+ static ID id_text;
+
+ /* Analyzer Methods */
+ static ID id_token_stream;
+
+ static VALUE object_space;
+
+ #ifndef FRT_RUBY_VERSION_1_9
+ extern int ruby_re_search(struct re_pattern_buffer *, const char *, int, int,
+                           int, struct re_registers *);
+ #endif
+
+ int
+ frb_rb_hash_size(VALUE hash)
+ {
+ #ifdef FRT_RUBY_VERSION_1_9
+     return RHASH(hash)->ntbl->num_entries;
+ #else
+     return RHASH(hash)->tbl->num_entries;
+ #endif
+ }
+
+ /****************************************************************************
+  *
+  * Utility Methods
+  *
+  ****************************************************************************/
+
+ static char **
+ get_stopwords(VALUE rstop_words)
+ {
+     char **stop_words;
+     int i, len;
+     VALUE rstr;
+     Check_Type(rstop_words, T_ARRAY);
+     len = RARRAY_LEN(rstop_words);
+     stop_words = ALLOC_N(char *, RARRAY_LEN(rstop_words) + 1);
+     stop_words[len] = NULL;
+     for (i = 0; i < len; i++) {
+         rstr = rb_obj_as_string(RARRAY_PTR(rstop_words)[i]);
+         stop_words[i] = rs2s(rstr);
+     }
+     return stop_words;
+ }
+
+ /****************************************************************************
+  *
+  * token methods
+  *
+  ****************************************************************************/
+
+ typedef struct RToken {
+     VALUE text;
+     int start;
+     int end;
+     int pos_inc;
+ } RToken;
+
+ static void
+ frb_token_free(void *p)
+ {
+     free(p);
+ }
+
+ static void
+ frb_token_mark(void *p)
+ {
+     RToken *token = (RToken *)p;
+     rb_gc_mark(token->text);
+ }
+
+ static VALUE
+ frb_token_alloc(VALUE klass)
+ {
+     return Data_Wrap_Struct(klass, &frb_token_mark, &frb_token_free,
+                             ALLOC(RToken));
+ }
+
+ static VALUE
+ get_token(Token *tk)
+ {
+     RToken *token = ALLOC(RToken);
+
+     token->text = rb_str_new2(tk->text);
+     token->start = tk->start;
+     token->end = tk->end;
+     token->pos_inc = tk->pos_inc;
+     return Data_Wrap_Struct(cToken, &frb_token_mark, &frb_token_free, token);
+ }
+
+ Token *
+ frb_set_token(Token *tk, VALUE rt)
+ {
+     RToken *rtk;
+
+     if (rt == Qnil) return NULL;
+
+     Data_Get_Struct(rt, RToken, rtk);
+     tk_set(tk, rs2s(rtk->text), RSTRING_LEN(rtk->text),
+            rtk->start, rtk->end, rtk->pos_inc);
+     return tk;
+ }
+
+ #define GET_TK(tk, self) Data_Get_Struct(self, RToken, tk)
+
+ /*
+  * call-seq:
+  *    Token.new(text, start, end, pos_inc = 1) -> new Token
+  *
+  * Creates a new token, setting the text, the start and end offsets of the
+  * token and the position increment for the token.
+  *
+  * The position increment is usually set to 1 but you can set it to other
+  * values as needed. For example, if you have a stop word filter you will be
+  * skipping tokens. Let's say you have the stop words "the" and "and" and you
+  * parse the title "The Old Man and the Sea". The terms "Old", "Man" and
+  * "Sea" will have the position increments 2, 1 and 3 respectively.
+  *
+  * Another reason you might want to vary the position increment is if you are
+  * adding synonyms to the index. For example, let's say you have the synonym
+  * group "quick", "fast" and "speedy". When tokenizing the phrase "Next day
+  * speedy delivery", you'll add "speedy" first with a position increment of 1
+  * and then "fast" and "quick" with position increments of 0 since they are
+  * represented in the same position.
+  *
+  * The offset values +start+ and +end+ should be byte offsets, not
+  * character offsets. This makes it easy to use those offsets to quickly
+  * access the token in the input string and also to insert highlighting tags
+  * when necessary.
+  *
+  * text::    the main text for the token.
+  * start::   the start offset of the token in bytes.
+  * end::     the end offset of the token in bytes.
+  * pos_inc:: the position increment of a token. See above.
+  * return::  a newly created and assigned Token object
+  */
+ static VALUE
+ frb_token_init(int argc, VALUE *argv, VALUE self)
+ {
+     RToken *token;
+     VALUE rtext, rstart, rend, rpos_inc, rtype;
+     GET_TK(token, self);
+     token->pos_inc = 1;
+     switch (rb_scan_args(argc, argv, "32", &rtext, &rstart,
+                          &rend, &rpos_inc, &rtype)) {
+         case 5: /* type gets ignored at this stage */
+         case 4: token->pos_inc = FIX2INT(rpos_inc);
+     }
+     token->text = rb_obj_as_string(rtext);
+     token->start = FIX2INT(rstart);
+     token->end = FIX2INT(rend);
+     return self;
+ }
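+
+ /* A minimal sketch of the Token API documented above (text and byte
+  * offsets are illustrative, taken from the phrase "Next day speedy
+  * delivery"):
+  *
+  *    t = Token.new("speedy", 9, 15)      # pos_inc defaults to 1
+  *    t = Token.new("fast", 9, 15, 0)     # synonym in the same position
+  *    t.text                              #=> "fast"
+  */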
+
+ /*
+  * call-seq:
+  *    token.cmp(other_token) -> bool
+  *
+  * Used to compare two tokens. Token is extended by Comparable so you can
+  * also use +<+, +>+, +<=+, +>=+ etc. to compare tokens.
+  *
+  * Tokens are sorted by the position in the text at which they occur, i.e.
+  * the start offset. If two tokens have the same start offset (see
+  * pos_inc=), they are sorted by the end offset and then lexically by the
+  * token text.
+  */
+ static VALUE
+ frb_token_cmp(VALUE self, VALUE rother)
+ {
+     RToken *token, *other;
+     int cmp;
+     GET_TK(token, self);
+     GET_TK(other, rother);
+     if (token->start > other->start) {
+         cmp = 1;
+     } else if (token->start < other->start) {
+         cmp = -1;
+     } else {
+         if (token->end > other->end) {
+             cmp = 1;
+         } else if (token->end < other->end) {
+             cmp = -1;
+         } else {
+             cmp = strcmp(rs2s(token->text), rs2s(other->text));
+         }
+     }
+     return INT2FIX(cmp);
+ }
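+
+ /* For example (offsets are illustrative), tokens compare by start offset
+  * first, so:
+  *
+  *    Token.new("old", 4, 7) < Token.new("man", 8, 11)    #=> true
+  */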
+
+ /*
+  * call-seq:
+  *    token.text -> text
+  *
+  * Returns the text that this token represents
+  */
+ static VALUE
+ frb_token_get_text(VALUE self)
+ {
+     RToken *token;
+     GET_TK(token, self);
+     return token->text;
+ }
+
+ /*
+  * call-seq:
+  *    token.text = text -> text
+  *
+  * Set the text for this token.
+  */
+ static VALUE
+ frb_token_set_text(VALUE self, VALUE rtext)
+ {
+     RToken *token;
+     GET_TK(token, self);
+     token->text = rtext;
+     return rtext;
+ }
+
+ /*
+  * call-seq:
+  *    token.start -> integer
+  *
+  * Start byte-position of this token
+  */
+ static VALUE
+ frb_token_get_start_offset(VALUE self)
+ {
+     RToken *token;
+     GET_TK(token, self);
+     return INT2FIX(token->start);
+ }
+
+ /*
+  * call-seq:
+  *    token.end -> integer
+  *
+  * End byte-position of this token
+  */
+ static VALUE
+ frb_token_get_end_offset(VALUE self)
+ {
+     RToken *token;
+     GET_TK(token, self);
+     return INT2FIX(token->end);
+ }
+
+ /*
+  * call-seq:
+  *    token.pos_inc -> integer
+  *
+  * Position Increment for this token
+  */
+ static VALUE
+ frb_token_get_pos_inc(VALUE self)
+ {
+     RToken *token;
+     GET_TK(token, self);
+     return INT2FIX(token->pos_inc);
+ }
+
+ /*
+  * call-seq:
+  *    token.start = start -> integer
+  *
+  * Set start byte-position of this token
+  */
+ static VALUE
+ frb_token_set_start_offset(VALUE self, VALUE rstart)
+ {
+     RToken *token;
+     GET_TK(token, self);
+     token->start = FIX2INT(rstart);
+     return rstart;
+ }
+
+ /*
+  * call-seq:
+  *    token.end = end -> integer
+  *
+  * Set end byte-position of this token
+  */
+ static VALUE
+ frb_token_set_end_offset(VALUE self, VALUE rend)
+ {
+     RToken *token;
+     GET_TK(token, self);
+     token->end = FIX2INT(rend);
+     return rend;
+ }
+
+ /*
+  * call-seq:
+  *    token.pos_inc = pos_inc -> integer
+  *
+  * Set the position increment. This determines the position of this token
+  * relative to the previous Token in a TokenStream, used in phrase
+  * searching.
+  *
+  * The default value is 1.
+  *
+  * Some common uses for this are:
+  *
+  * * Set it to zero to put multiple terms in the same position. This is
+  *   useful if, e.g., a word has multiple stems. Searches for phrases
+  *   including either stem will match. In this case, all but the first
+  *   stem's increment should be set to zero: the increment of the first
+  *   instance should be one. Repeating a token with an increment of zero
+  *   can also be used to boost the scores of matches on that token.
+  *
+  * * Set it to values greater than one to inhibit exact phrase matches.
+  *   If, for example, one does not want phrases to match across removed
+  *   stop words, then one could build a stop word filter that removes stop
+  *   words and also sets the increment to the number of stop words removed
+  *   before each non-stop word. Then exact phrase queries will only match
+  *   when the terms occur with no intervening stop words.
+  */
+ static VALUE
+ frb_token_set_pos_inc(VALUE self, VALUE rpos_inc)
+ {
+     RToken *token;
+     GET_TK(token, self);
+     token->pos_inc = FIX2INT(rpos_inc);
+     return rpos_inc;
+ }
+
+ /*
+  * call-seq:
+  *    token.to_s -> token_str
+  *
+  * Return a string representation of the token
+  */
+ static VALUE
+ frb_token_to_s(VALUE self)
+ {
+     RToken *token;
+     char *buf;
+     GET_TK(token, self);
+     buf = alloca(RSTRING_LEN(token->text) + 80);
+     sprintf(buf, "token[\"%s\":%d:%d:%d]", rs2s(token->text),
+             token->start, token->end, token->pos_inc);
+     return rb_str_new2(buf);
+ }
+
+ /****************************************************************************
+  *
+  * TokenStream Methods
+  *
+  ****************************************************************************/
+
+ #define GET_TS(ts, self) Data_Get_Struct(self, TokenStream, ts)
+
+ static void
+ frb_ts_mark(void *p)
+ {
+     TokenStream *ts = (TokenStream *)p;
+     if (ts->text) frb_gc_mark(&ts->text);
+ }
+
+ static void
+ frb_ts_free(TokenStream *ts)
+ {
+     if (object_get(&ts->text) != Qnil) {
+         object_del(&ts->text);
+     }
+     object_del(ts);
+     ts_deref(ts);
+ }
+
+ static void frb_rets_free(TokenStream *ts);
+ static void frb_rets_mark(TokenStream *ts);
+ static Token *rets_next(TokenStream *ts);
+
+ static VALUE
+ get_rb_token_stream(TokenStream *ts)
+ {
+     VALUE rts = object_get(ts);
+     if (rts == Qnil) {
+         if (ts->next == &rets_next) {
+             rts = Data_Wrap_Struct(cTokenStream, &frb_rets_mark,
+                                    &frb_rets_free, ts);
+         } else {
+             rts = Data_Wrap_Struct(cTokenStream, &frb_ts_mark,
+                                    &frb_ts_free, ts);
+         }
+         object_add(ts, rts);
+     }
+     return rts;
+ }
+
+ static INLINE VALUE
+ get_wrapped_ts(VALUE self, VALUE rstr, TokenStream *ts)
+ {
+     StringValue(rstr);
+     ts->reset(ts, rs2s(rstr));
+     Frt_Wrap_Struct(self, &frb_ts_mark, &frb_ts_free, ts);
+     object_add(&ts->text, rstr);
+     object_add(ts, self);
+     return self;
+ }
+
+ /*
+  * call-seq:
+  *    token_stream.text = text -> text
+  *
+  * Set the text attribute of the TokenStream to the text you wish to be
+  * tokenized. For example, you may do this:
+  *
+  *    token_stream.text = File.read(file_name)
+  */
+ static VALUE
+ frb_ts_set_text(VALUE self, VALUE rtext)
+ {
+     TokenStream *ts;
+     Data_Get_Struct(self, TokenStream, ts);
+     StringValue(rtext);
+     ts->reset(ts, rs2s(rtext));
+
+     /* prevent garbage collection */
+     rb_ivar_set(self, id_text, rtext);
+
+     return rtext;
+ }
+
+ /*
+  * call-seq:
+  *    token_stream.text -> text
+  *
+  * Return the text that the TokenStream is tokenizing
+  */
+ static VALUE
+ frb_ts_get_text(VALUE self)
+ {
+     VALUE rtext = Qnil;
+     TokenStream *ts;
+     Data_Get_Struct(self, TokenStream, ts);
+     if ((rtext = object_get(&ts->text)) == Qnil) {
+         if (ts->text) {
+             rtext = rb_str_new2(ts->text);
+             object_set(&ts->text, rtext);
+         }
+     }
+     return rtext;
+ }
+
+ /*
+  * call-seq:
+  *    token_stream.next -> token
+  *
+  * Return the next token from the TokenStream or nil if there are no more
+  * tokens.
+  */
+ static VALUE
+ frb_ts_next(VALUE self)
+ {
+     TokenStream *ts;
+     Token *next;
+     GET_TS(ts, self);
+     next = ts->next(ts);
+     if (next == NULL) {
+         return Qnil;
+     }
+
+     return get_token(next);
+ }
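+
+ /* A typical consumption loop for any TokenStream, since #next returns nil
+  * at the end of the stream (tokenizer and input are illustrative):
+  *
+  *    ts = StandardTokenizer.new("Send dave@gmail.com a note")
+  *    while token = ts.next
+  *      puts token.text
+  *    end
+  */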
+
+ /****************************************************************************
+  * TokenFilter
+  ****************************************************************************/
+
+ #define TkFilt(filter) ((TokenFilter *)(filter))
+
+ static void
+ frb_tf_mark(void *p)
+ {
+     TokenStream *ts = (TokenStream *)p;
+     if (TkFilt(ts)->sub_ts) {
+         frb_gc_mark(&TkFilt(ts)->sub_ts);
+     }
+ }
+
+ static void
+ frb_tf_free(TokenStream *ts)
+ {
+     if (TkFilt(ts)->sub_ts && (object_get(&TkFilt(ts)->sub_ts) != Qnil)) {
+         object_del(&TkFilt(ts)->sub_ts);
+     }
+     object_del(ts);
+     ts_deref(ts);
+ }
+
+
+ /****************************************************************************
+  * CWrappedTokenStream
+  ****************************************************************************/
+
+ #define CachedTS(token_stream) ((CachedTokenStream *)(token_stream))
+ #define CWTS(token_stream) ((CWrappedTokenStream *)(token_stream))
+
+ typedef struct CWrappedTokenStream {
+     CachedTokenStream super;
+     VALUE rts;
+ } CWrappedTokenStream;
+
+ static void
+ cwrts_destroy_i(TokenStream *ts)
+ {
+     if (object_get(&ts->text) != Qnil) {
+         object_del(&ts->text);
+     }
+     rb_hash_delete(object_space, ((VALUE)ts)|1);
+     /*printf("rb_hash_size = %d\n", frb_rb_hash_size(object_space)); */
+     free(ts);
+ }
+
+ static Token *
+ cwrts_next(TokenStream *ts)
+ {
+     VALUE rtoken = rb_funcall(CWTS(ts)->rts, id_next, 0);
+     return frb_set_token(&(CachedTS(ts)->token), rtoken);
+ }
+
+ static TokenStream *
+ cwrts_reset(TokenStream *ts, char *text)
+ {
+     ts->t = ts->text = text;
+     rb_funcall(CWTS(ts)->rts, id_reset, 1, rb_str_new2(text));
+     return ts;
+ }
+
+ static TokenStream *
+ cwrts_clone_i(TokenStream *orig_ts)
+ {
+     TokenStream *new_ts = ts_clone_size(orig_ts, sizeof(CWrappedTokenStream));
+     VALUE rts = CWTS(new_ts)->rts = rb_funcall(CWTS(orig_ts)->rts, id_clone, 0);
+     rb_hash_aset(object_space, ((VALUE)new_ts)|1, rts);
+     return new_ts;
+ }
+
+ static TokenStream *
+ frb_get_cwrapped_rts(VALUE rts)
+ {
+     TokenStream *ts;
+     if (frb_is_cclass(rts) && DATA_PTR(rts)) {
+         GET_TS(ts, rts);
+         REF(ts);
+     }
+     else {
+         ts = ts_new(CWrappedTokenStream);
+         CWTS(ts)->rts = rts;
+         ts->next = &cwrts_next;
+         ts->reset = &cwrts_reset;
+         ts->clone_i = &cwrts_clone_i;
+         ts->destroy_i = &cwrts_destroy_i;
+         /* prevent from being garbage collected */
+         rb_hash_aset(object_space, ((VALUE)ts)|1, rts);
+         ts->ref_cnt = 1;
+     }
+     return ts;
+ }
+
+ /****************************************************************************
+  * RegExpTokenStream
+  ****************************************************************************/
+
+ #define P "[_\\/.,-]"
+ #define HASDIGIT "\\w*\\d\\w*"
+ #define ALPHA "[-_[:alpha:]]"
+ #define ALNUM "[-_[:alnum:]]"
+
+ #define RETS(token_stream) ((RegExpTokenStream *)(token_stream))
+
+ static const char *TOKEN_RE =
+     ALPHA "+(('" ALPHA "+)+|\\.(" ALPHA "\\.)+|"
+     "(@|\\&)\\w+([-.]\\w+)*|:\\/\\/" ALNUM "+([-.\\/]" ALNUM "+)*)?"
+     "|\\w+(([-._]\\w+)*\\@\\w+([-.]\\w+)+"
+     "|" P HASDIGIT "(" P "\\w+" P HASDIGIT ")*(" P "\\w+)?"
+     "|(\\.\\w+)+"
+     "|"
+     ")";
+ static VALUE rtoken_re;
+
+ typedef struct RegExpTokenStream {
+     CachedTokenStream super;
+     VALUE rtext;
+     VALUE regex;
+     VALUE proc;
+     long curr_ind;
+ } RegExpTokenStream;
+
+ static void
+ rets_destroy_i(TokenStream *ts)
+ {
+     if (object_get(&ts->text) != Qnil) {
+         object_del(&ts->text);
+     }
+     rb_hash_delete(object_space, ((VALUE)ts)|1);
+     /*printf("rb_hash_size = %d\n", frb_rb_hash_size(object_space)); */
+     free(ts);
+ }
+
+ static void
+ frb_rets_free(TokenStream *ts)
+ {
+     if (object_get(&ts->text) != Qnil) {
+         object_del(&ts->text);
+     }
+     object_del(ts);
+     ts_deref(ts);
+ }
+
+ static void
+ frb_rets_mark(TokenStream *ts)
+ {
+     if (ts->text) frb_gc_mark(&ts->text);
+     rb_gc_mark(RETS(ts)->rtext);
+     rb_gc_mark(RETS(ts)->regex);
+     rb_gc_mark(RETS(ts)->proc);
+ }
+
+ /*
+  * call-seq:
+  *    tokenizer.text = text -> text
+  *
+  * Set the text to be tokenized by the tokenizer. The tokenizer gets reset to
+  * tokenize the text from the beginning.
+  */
+ static VALUE
+ frb_rets_set_text(VALUE self, VALUE rtext)
+ {
+     TokenStream *ts;
+     GET_TS(ts, self);
+
+     rb_hash_aset(object_space, ((VALUE)ts)|1, rtext);
+     StringValue(rtext);
+     RETS(ts)->rtext = rtext;
+     RETS(ts)->curr_ind = 0;
+
+     return rtext;
+ }
+
+ /*
+  * call-seq:
+  *    tokenizer.text -> text
+  *
+  * Get the text being tokenized by the tokenizer.
+  */
+ static VALUE
+ frb_rets_get_text(VALUE self)
+ {
+     TokenStream *ts;
+     GET_TS(ts, self);
+     return RETS(ts)->rtext;
+ }
+
+ #ifdef FRT_RUBY_VERSION_1_9
+
+ // partly lifted from ruby 1.9 string.c
+ #include <ruby/encoding.h>
+ #define BEG(no) regs->beg[no]
+ #define END(no) regs->end[no]
+ #define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
+ static VALUE
+ scan_once(VALUE str, VALUE pat, long *start)
+ {
+     VALUE match;
+     struct re_registers *regs;
+
+     if (rb_reg_search(pat, str, *start, 0) >= 0) {
+         match = rb_backref_get();
+         regs = RMATCH_REGS(match);
+         if (BEG(0) == END(0)) {
+             rb_encoding *enc = STR_ENC_GET(str);
+             /*
+              * Always consume at least one character of the input string
+              */
+             if (RSTRING_LEN(str) > END(0))
+                 *start = END(0)+rb_enc_mbclen(RSTRING_PTR(str)+END(0),
+                                               RSTRING_END(str), enc);
+             else
+                 *start = END(0)+1;
+         }
+         else {
+             *start = END(0);
+         }
+         return rb_reg_nth_match(0, match);
+     }
+     return Qnil;
+ }
+ //
+
+ static Token *
+ rets_next(TokenStream *ts)
+ {
+     VALUE ret;
+     long rtok_len;
+     int beg, end;
+     Check_Type(RETS(ts)->regex, T_REGEXP);
+     ret = scan_once(RETS(ts)->rtext, RETS(ts)->regex, &(RETS(ts)->curr_ind));
+     if (NIL_P(ret)) return NULL;
+
+     Check_Type(ret, T_STRING);
+     rtok_len = RSTRING_LEN(ret);
+     beg = RETS(ts)->curr_ind - rtok_len;
+     end = RETS(ts)->curr_ind;
+
+     if (NIL_P(RETS(ts)->proc)) {
+         return tk_set(&(CachedTS(ts)->token), rs2s(ret), rtok_len,
+                       beg, end, 1);
+     } else {
+         VALUE rtok;
+         rtok = rb_funcall(RETS(ts)->proc, id_call, 1, ret);
+         return tk_set(&(CachedTS(ts)->token), rs2s(rtok),
+                       RSTRING_LEN(rtok), beg, end, 1);
+     }
+ }
+
+ #else
+
+ static Token *
+ rets_next(TokenStream *ts)
+ {
+     static struct re_registers regs;
+     int ret, beg, end;
+     long rtext_len = RSTRING_LEN(RETS(ts)->rtext);
+     char *rtext_ptr = RSTRING_PTR(RETS(ts)->rtext);
+     Check_Type(RETS(ts)->regex, T_REGEXP);
+     ret = ruby_re_search(RREGEXP(RETS(ts)->regex)->ptr,
+                          rtext_ptr, rtext_len,
+                          RETS(ts)->curr_ind, rtext_len - RETS(ts)->curr_ind,
+                          &regs);
+
+     if (ret == -2) rb_raise(rb_eStandardError, "regexp buffer overflow");
+     if (ret < 0) return NULL; /* not matched */
+
+     beg = regs.beg[0];
+     RETS(ts)->curr_ind = end = regs.end[0];
+     if (NIL_P(RETS(ts)->proc)) {
+         return tk_set(&(CachedTS(ts)->token), rtext_ptr + beg, end - beg,
+                       beg, end, 1);
+     } else {
+         VALUE rtok = rb_str_new(rtext_ptr + beg, end - beg);
+         rtok = rb_funcall(RETS(ts)->proc, id_call, 1, rtok);
+         return tk_set(&(CachedTS(ts)->token), rs2s(rtok),
+                       RSTRING_LEN(rtok), beg, end, 1);
+     }
+ }
+
+ #endif
+
+ static TokenStream *
+ rets_reset(TokenStream *ts, char *text)
+ {
+     RETS(ts)->rtext = rb_str_new2(text);
+     RETS(ts)->curr_ind = 0;
+     return ts;
+ }
+
+ static TokenStream *
+ rets_clone_i(TokenStream *orig_ts)
+ {
+     TokenStream *ts = ts_clone_size(orig_ts, sizeof(RegExpTokenStream));
+     return ts;
+ }
+
+ static TokenStream *
+ rets_new(VALUE rtext, VALUE regex, VALUE proc)
+ {
+     TokenStream *ts = ts_new(RegExpTokenStream);
+
+     if (rtext != Qnil) {
+         rtext = StringValue(rtext);
+         rb_hash_aset(object_space, ((VALUE)ts)|1, rtext);
+     }
+     ts->reset = &rets_reset;
+     ts->next = &rets_next;
+     ts->clone_i = &rets_clone_i;
+     ts->destroy_i = &rets_destroy_i;
+
+     RETS(ts)->curr_ind = 0;
+     RETS(ts)->rtext = rtext;
+     RETS(ts)->proc = proc;
+
+     if (NIL_P(regex)) {
+         RETS(ts)->regex = rtoken_re;
+     } else {
+         Check_Type(regex, T_REGEXP);
+         RETS(ts)->regex = regex;
+     }
+
+     return ts;
+ }
+
+ /*
+  * call-seq:
+  *    RegExpTokenizer.new(input, /[[:alpha:]]+/)
+  *
+  * Create a new tokenizer based on a regular expression
+  *
+  * input::  text to tokenize
+  * regexp:: regular expression used to recognize tokens in the input
+  */
+ static VALUE
+ frb_rets_init(int argc, VALUE *argv, VALUE self)
+ {
+     VALUE rtext, regex, proc;
+     TokenStream *ts;
+
+     rb_scan_args(argc, argv, "11&", &rtext, &regex, &proc);
+
+     ts = rets_new(rtext, regex, proc);
+
+     Frt_Wrap_Struct(self, &frb_rets_mark, &frb_rets_free, ts);
+     object_add(ts, self);
+     return self;
+ }
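+
+ /* For example (pattern and input are illustrative), tokenizing on runs of
+  * letters only:
+  *
+  *    ts = RegExpTokenizer.new("Ferret 0.11, now with bzip2!", /[[:alpha:]]+/)
+  *    ts.next.text    #=> "Ferret"
+  */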
+
+ /****************************************************************************
+  * Tokenizers
+  ****************************************************************************/
+
+ #define TS_ARGS(dflt) \
+     bool lower;\
+     VALUE rlower, rstr;\
+     rb_scan_args(argc, argv, "11", &rstr, &rlower);\
+     lower = (argc ? RTEST(rlower) : dflt)
+
+ /*
+  * call-seq:
+  *    AsciiLetterTokenizer.new() -> tokenizer
+  *
+  * Create a new AsciiLetterTokenizer
+  */
+ static VALUE
+ frb_a_letter_tokenizer_init(VALUE self, VALUE rstr)
+ {
+     return get_wrapped_ts(self, rstr, letter_tokenizer_new());
+ }
+
+ /*
+  * call-seq:
+  *    LetterTokenizer.new(lower = true) -> tokenizer
+  *
+  * Create a new LetterTokenizer which optionally downcases tokens. Downcasing
+  * is done according to the current locale.
+  *
+  * lower:: set to false if you don't wish to downcase tokens
+  */
+ static VALUE
+ frb_letter_tokenizer_init(int argc, VALUE *argv, VALUE self)
+ {
+     TS_ARGS(false);
+ #ifndef POSH_OS_WIN32
+     if (!frb_locale) frb_locale = setlocale(LC_CTYPE, "");
+ #endif
+     return get_wrapped_ts(self, rstr, mb_letter_tokenizer_new(lower));
+ }
+
+ /*
+  * call-seq:
+  *    AsciiWhiteSpaceTokenizer.new() -> tokenizer
+  *
+  * Create a new AsciiWhiteSpaceTokenizer
+  */
+ static VALUE
+ frb_a_whitespace_tokenizer_init(VALUE self, VALUE rstr)
+ {
+     return get_wrapped_ts(self, rstr, whitespace_tokenizer_new());
+ }
+
+ /*
+  * call-seq:
+  *    WhiteSpaceTokenizer.new(lower = true) -> tokenizer
+  *
+  * Create a new WhiteSpaceTokenizer which optionally downcases tokens.
+  * Downcasing is done according to the current locale.
+  *
+  * lower:: set to false if you don't wish to downcase tokens
+  */
+ static VALUE
+ frb_whitespace_tokenizer_init(int argc, VALUE *argv, VALUE self)
+ {
+     TS_ARGS(false);
+ #ifndef POSH_OS_WIN32
+     if (!frb_locale) frb_locale = setlocale(LC_CTYPE, "");
+ #endif
+     return get_wrapped_ts(self, rstr, mb_whitespace_tokenizer_new(lower));
+ }
+
+ /*
+  * call-seq:
+  *    AsciiStandardTokenizer.new() -> tokenizer
+  *
+  * Create a new AsciiStandardTokenizer
+  */
+ static VALUE
+ frb_a_standard_tokenizer_init(VALUE self, VALUE rstr)
+ {
+     return get_wrapped_ts(self, rstr, standard_tokenizer_new());
+ }
+
+ /*
+  * call-seq:
+  *    StandardTokenizer.new(lower = true) -> tokenizer
+  *
+  * Create a new StandardTokenizer which optionally downcases tokens.
+  * Downcasing is done according to the current locale.
+  *
+  * lower:: set to false if you don't wish to downcase tokens
+  */
+ static VALUE
+ frb_standard_tokenizer_init(VALUE self, VALUE rstr)
+ {
+ #ifndef POSH_OS_WIN32
+     if (!frb_locale) frb_locale = setlocale(LC_CTYPE, "");
+ #endif
+     return get_wrapped_ts(self, rstr, mb_standard_tokenizer_new());
+ }
+
+ /****************************************************************************
+  * Filters
+  ****************************************************************************/
+
+
+ /*
+  * call-seq:
+  *    AsciiLowerCaseFilter.new(token_stream) -> token_stream
+  *
+  * Create an AsciiLowerCaseFilter which normalizes a token's text to
+  * lowercase but only for ASCII characters. For other characters use
+  * LowerCaseFilter.
+  */
+ static VALUE
+ frb_a_lowercase_filter_init(VALUE self, VALUE rsub_ts)
+ {
+     TokenStream *ts = frb_get_cwrapped_rts(rsub_ts);
+     ts = lowercase_filter_new(ts);
+     object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
+
+     Frt_Wrap_Struct(self, &frb_tf_mark, &frb_tf_free, ts);
+     object_add(ts, self);
+     return self;
+ }
+
+ /*
+  * call-seq:
+  *    LowerCaseFilter.new(token_stream) -> token_stream
+  *
+  * Create a LowerCaseFilter which normalizes a token's text to
+  * lowercase based on the current locale.
+  */
+ static VALUE
+ frb_lowercase_filter_init(VALUE self, VALUE rsub_ts)
+ {
+     TokenStream *ts = frb_get_cwrapped_rts(rsub_ts);
+ #ifndef POSH_OS_WIN32
+     if (!frb_locale) frb_locale = setlocale(LC_CTYPE, "");
+ #endif
+     ts = mb_lowercase_filter_new(ts);
+     object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
+
+     Frt_Wrap_Struct(self, &frb_tf_mark, &frb_tf_free, ts);
+     object_add(ts, self);
+     return self;
+ }
+
+ /*
+  * call-seq:
+  *    HyphenFilter.new(token_stream) -> token_stream
+  *
+  * Create a HyphenFilter which filters hyphenated words by adding both the
+  * word concatenated into a single word and the word split into multiple
+  * words, i.e. "e-mail" becomes "email" and "e mail". This way searches for
+  * "e-mail", "email" and "mail" will all match. This filter is used by
+  * default by the StandardAnalyzer.
+  */
+ static VALUE
+ frb_hyphen_filter_init(VALUE self, VALUE rsub_ts)
+ {
+     TokenStream *ts = frb_get_cwrapped_rts(rsub_ts);
+     ts = hyphen_filter_new(ts);
+     object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
+
+     Frt_Wrap_Struct(self, &frb_tf_mark, &frb_tf_free, ts);
+     object_add(ts, self);
+     return self;
+ }
+
+ /*
+  * call-seq:
+  *    StopFilter.new(token_stream) -> token_stream
+  *    StopFilter.new(token_stream, ["the", "and", "it"]) -> token_stream
+  *
+  * Create a StopFilter which removes *stop-words* from a TokenStream. You can
+  * optionally specify the stop-words you wish to have removed.
+  *
+  * token_stream:: TokenStream to be filtered
+  * stop_words::   Array of *stop-words* you wish to be filtered out. This
+  *                defaults to a list of English stop-words. The
+  *                Ferret::Analysis module contains a number of stop-word
+  *                lists.
+  */
+ static VALUE
+ frb_stop_filter_init(int argc, VALUE *argv, VALUE self)
+ {
+     VALUE rsub_ts, rstop_words;
+     TokenStream *ts;
+     rb_scan_args(argc, argv, "11", &rsub_ts, &rstop_words);
+     ts = frb_get_cwrapped_rts(rsub_ts);
+     if (rstop_words != Qnil) {
+         char **stop_words = get_stopwords(rstop_words);
+         ts = stop_filter_new_with_words(ts, (const char **)stop_words);
+
+         free(stop_words);
+     } else {
+         ts = stop_filter_new(ts);
+     }
+     object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
+
+     Frt_Wrap_Struct(self, &frb_tf_mark, &frb_tf_free, ts);
+     object_add(ts, self);
+     return self;
+ }
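+
+ /* A short sketch of a StopFilter in a filter chain (stop words and input
+  * are illustrative):
+  *
+  *    ts = StopFilter.new(LetterTokenizer.new("the quick brown fox"),
+  *                        ["the", "and"])
+  *    ts.next.text    #=> "quick"
+  */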
+
+ static INLINE void frb_add_mapping_i(TokenStream *mf, VALUE from,
+                                      const char *to)
+ {
+     switch (TYPE(from)) {
+         case T_STRING:
+             mapping_filter_add(mf, rs2s(from), to);
+             break;
+         case T_SYMBOL:
+             mapping_filter_add(mf, rb_id2name(SYM2ID(from)), to);
+             break;
+         default:
+             rb_raise(rb_eArgError,
+                      "cannot map from %s with MappingFilter",
+                      rs2s(rb_obj_as_string(from)));
+             break;
+     }
+ }
+
+ static int frb_add_mappings_i(VALUE key, VALUE value, VALUE arg)
+ {
+     if (key == Qundef) {
+         return ST_CONTINUE;
+     } else {
+         TokenStream *mf = (TokenStream *)arg;
+         const char *to;
+         switch (TYPE(value)) {
+             case T_STRING:
+                 to = rs2s(value);
+                 break;
+             case T_SYMBOL:
+                 to = rb_id2name(SYM2ID(value));
+                 break;
+             default:
+                 rb_raise(rb_eArgError,
+                          "cannot map to %s with MappingFilter",
+                          rs2s(rb_obj_as_string(key)));
+                 break;
+         }
+         if (TYPE(key) == T_ARRAY) {
+             int i;
+             for (i = RARRAY_LEN(key) - 1; i >= 0; i--) {
+                 frb_add_mapping_i(mf, RARRAY_PTR(key)[i], to);
+             }
+         }
+         else {
+             frb_add_mapping_i(mf, key, to);
+         }
+     }
+     return ST_CONTINUE;
+ }
+
+
+ /*
+  * call-seq:
+  *    MappingFilter.new(token_stream, mapping) -> token_stream
+  *
+  * Create a MappingFilter which maps strings in tokens. This is usually used
+  * to map UTF-8 characters to ASCII characters for easier searching and
+  * better search recall. The mapping is compiled into a Deterministic Finite
+  * Automaton so it is super fast. This filter can therefore be used for
+  * indexing very large datasets. Currently regular expressions are not
+  * supported. If you are really interested in the feature, please contact me
+  * at dbalmain@gmail.com.
+  *
+  * token_stream:: TokenStream to be filtered
+  * mapping::      Hash of mappings to apply to tokens. The key can be a
+  *                String or an Array of Strings. The value must be a String.
+  *
+  * == Example
+  *
+  *    filt = MappingFilter.new(token_stream,
+  *                             {
+  *                               ['à','á','â','ã','ä','å'] => 'a',
+  *                               ['è','é','ê','ë','ē','ę'] => 'e'
+  *                             })
+  */
+ static VALUE
+ frb_mapping_filter_init(VALUE self, VALUE rsub_ts, VALUE mapping)
+ {
+     TokenStream *ts;
+     ts = frb_get_cwrapped_rts(rsub_ts);
+     ts = mapping_filter_new(ts);
+     rb_hash_foreach(mapping, frb_add_mappings_i, (VALUE)ts);
+     mulmap_compile(((MappingFilter *)ts)->mapper);
+     object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
+
+     Frt_Wrap_Struct(self, &frb_tf_mark, &frb_tf_free, ts);
+     object_add(ts, self);
+     return self;
+ }
+
+ /*
+  * call-seq:
+  *    StemFilter.new(token_stream) -> token_stream
+  *    StemFilter.new(token_stream,
+  *                   algorithm = "english",
+  *                   encoding = "UTF-8") -> token_stream
+  *
+  * Create a StemFilter which uses a snowball stemmer (thank you Martin
+  * Porter) to stem words. You can optionally specify the algorithm (default:
+  * "english") and encoding (default: "UTF-8").
+  *
+  * token_stream:: TokenStream to be filtered
+  * algorithm::    The algorithm (or language) to use
+  * encoding::     The encoding of the data (default: "UTF-8")
+  */
+ static VALUE
+ frb_stem_filter_init(int argc, VALUE *argv, VALUE self)
+ {
+     VALUE rsub_ts, ralgorithm, rcharenc;
+     char *algorithm = "english";
+     char *charenc = NULL;
+     TokenStream *ts;
+     rb_scan_args(argc, argv, "12", &rsub_ts, &ralgorithm, &rcharenc);
+     ts = frb_get_cwrapped_rts(rsub_ts);
+     switch (argc) {
+         case 3: charenc = rs2s(rb_obj_as_string(rcharenc));
+         case 2: algorithm = rs2s(rb_obj_as_string(ralgorithm));
+     }
+     ts = stem_filter_new(ts, algorithm, charenc);
+     object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
+
+     Frt_Wrap_Struct(self, &frb_tf_mark, &frb_tf_free, ts);
+     object_add(ts, self);
+     if (((StemFilter *)ts)->stemmer == NULL) {
+         rb_raise(rb_eArgError, "No stemmer could be found with the encoding "
+                                "%s and the language %s", charenc, algorithm);
+     }
+     return self;
+ }
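+
+ /* A short sketch of a StemFilter (input is illustrative); the algorithm
+  * defaults to "english" and the encoding to "UTF-8":
+  *
+  *    ts = StemFilter.new(LetterTokenizer.new("dancing dancers dance"))
+  *    ts.next.text    #=> "danc"
+  */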
+
+ /****************************************************************************
+  *
+  * Analyzer Methods
+  *
+  ****************************************************************************/
+
+ /****************************************************************************
+  * CWrappedAnalyzer Methods
+  ****************************************************************************/
+
+ #define GET_A(a, self) Data_Get_Struct(self, Analyzer, a)
+
+ #define CWA(analyzer) ((CWrappedAnalyzer *)(analyzer))
+ typedef struct CWrappedAnalyzer
+ {
+     Analyzer super;
+     VALUE ranalyzer;
+ } CWrappedAnalyzer;
+
+ static void
+ cwa_destroy_i(Analyzer *a)
+ {
+     rb_hash_delete(object_space, ((VALUE)a)|1);
+     /*printf("rb_hash_size = %d\n", frb_rb_hash_size(object_space)); */
+     free(a);
+ }
+
+ static TokenStream *
+ cwa_get_ts(Analyzer *a, Symbol field, char *text)
+ {
+     VALUE rts = rb_funcall(CWA(a)->ranalyzer, id_token_stream, 2,
+                            FSYM2SYM(field), rb_str_new2(text));
+     return frb_get_cwrapped_rts(rts);
+ }
+
+ Analyzer *
+ frb_get_cwrapped_analyzer(VALUE ranalyzer)
+ {
+     Analyzer *a = NULL;
+     if (frb_is_cclass(ranalyzer) && DATA_PTR(ranalyzer)) {
+         Data_Get_Struct(ranalyzer, Analyzer, a);
+         REF(a);
+     }
+     else {
+         a = (Analyzer *)frt_ecalloc(sizeof(CWrappedAnalyzer));
+         a->destroy_i = &cwa_destroy_i;
+         a->get_ts = &cwa_get_ts;
+         a->ref_cnt = 1;
+         ((CWrappedAnalyzer *)a)->ranalyzer = ranalyzer;
+         /* prevent from being garbage collected */
+         rb_hash_aset(object_space, ((VALUE)a)|1, ranalyzer);
+     }
+     return a;
+ }
+
+ static void
+ frb_analyzer_free(Analyzer *a)
+ {
+     object_del(a);
+     a_deref(a);
+ }
+
+ VALUE
+ frb_get_analyzer(Analyzer *a)
+ {
+     VALUE self = Qnil;
+     if (a) {
+         self = object_get(a);
+         if (self == Qnil) {
+             self = Data_Wrap_Struct(cAnalyzer, NULL, &frb_analyzer_free, a);
+             REF(a);
+             object_add(a, self);
+         }
+     }
+     return self;
+ }
+
+ INLINE VALUE
+ get_rb_ts_from_a(Analyzer *a, VALUE rfield, VALUE rstring)
+ {
+     TokenStream *ts = a_get_ts(a, frb_field(rfield), rs2s(rstring));
+
+     /* Make sure that there is no entry already */
+     object_set(&ts->text, rstring);
+     return get_rb_token_stream(ts);
+ }
+
+ /*
+  * call-seq:
+  *    analyzer.token_stream(field_name, input) -> token_stream
+  *
+  * Create a new TokenStream to tokenize +input+. The TokenStream created may
+  * also depend on the +field_name+, although this parameter is typically
+  * ignored.
+  *
+  * field_name:: name of the field to be tokenized
+  * input::      data from the field to be tokenized
+  */
+ static VALUE
+ frb_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
+ {
+     /* NOTE: Any changes made to this method may also need to be applied to
+      * frb_re_analyzer_token_stream */
+     Analyzer *a;
+     GET_A(a, self);
+
+     StringValue(rstring);
+
+     return get_rb_ts_from_a(a, rfield, rstring);
+ }
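+
+ /* For example (field name and input are illustrative):
+  *
+  *    analyzer = StandardAnalyzer.new
+  *    ts = analyzer.token_stream(:title, "The Quick Brown Fox")
+  *    while t = ts.next
+  *      puts t.text
+  *    end
+  */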
+
+ #define GET_LOWER(dflt) \
+     bool lower;\
+     VALUE rlower;\
+     rb_scan_args(argc, argv, "01", &rlower);\
+     lower = (argc ? RTEST(rlower) : dflt)
+
+ /*
+  * call-seq:
+  *    AsciiWhiteSpaceAnalyzer.new(lower = false) -> analyzer
+  *
+  * Create a new AsciiWhiteSpaceAnalyzer which leaves a token's case as is
+  * by default but can optionally downcase tokens. Lowercasing will only be
+  * done to ASCII characters.
+  *
+  * lower:: set to true if you want the field's tokens to be downcased
+  */
+ static VALUE
+ frb_a_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
+ {
+     Analyzer *a;
+     GET_LOWER(false);
+     a = whitespace_analyzer_new(lower);
+     Frt_Wrap_Struct(self, NULL, &frb_analyzer_free, a);
+     object_add(a, self);
+     return self;
+ }
+
+ /*
+  * call-seq:
+  *    WhiteSpaceAnalyzer.new(lower = false) -> analyzer
+  *
+  * Create a new WhiteSpaceAnalyzer which leaves a token's case as is by
+  * default but can optionally downcase tokens. Lowercasing will be done
+  * based on the current locale.
+  *
+  * lower:: set to true if you want the field's tokens to be downcased
+  */
+ static VALUE
+ frb_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
+ {
+     Analyzer *a;
+     GET_LOWER(false);
+ #ifndef POSH_OS_WIN32
+     if (!frb_locale) frb_locale = setlocale(LC_CTYPE, "");
+ #endif
+     a = mb_whitespace_analyzer_new(lower);
+     Frt_Wrap_Struct(self, NULL, &frb_analyzer_free, a);
+     object_add(a, self);
+     return self;
+ }
+
+ /*
+  * call-seq:
+  *    AsciiLetterAnalyzer.new(lower = true) -> analyzer
+  *
+  * Create a new AsciiLetterAnalyzer which downcases tokens by default
+  * but can optionally leave case as is. Lowercasing will only be done to
+  * ASCII characters.
+  *
+  * lower:: set to false if you don't want the field's tokens to be downcased
+  */
+ static VALUE
+ frb_a_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
+ {
+     Analyzer *a;
+     GET_LOWER(true);
+     a = letter_analyzer_new(lower);
+     Frt_Wrap_Struct(self, NULL, &frb_analyzer_free, a);
+     object_add(a, self);
+     return self;
+ }
+
+ /*
+  * call-seq:
+  *    LetterAnalyzer.new(lower = true) -> analyzer
+  *
+  * Create a new LetterAnalyzer which downcases tokens by default but can
+  * optionally leave case as is. Lowercasing will be done based on the current
+  * locale.
+  *
+  * lower:: set to false if you don't want the field's tokens to be downcased
+  */
+ static VALUE
+ frb_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
+ {
+     Analyzer *a;
+     GET_LOWER(true);
+ #ifndef POSH_OS_WIN32
+     if (!frb_locale) frb_locale = setlocale(LC_CTYPE, "");
+ #endif
+     a = mb_letter_analyzer_new(lower);
+     Frt_Wrap_Struct(self, NULL, &frb_analyzer_free, a);
+     object_add(a, self);
+     return self;
+ }
+
+ static VALUE
+ get_rstopwords(const char **stop_words)
+ {
+     char **w = (char **)stop_words;
+     VALUE rstopwords = rb_ary_new();
+
+     while (*w) {
+         rb_ary_push(rstopwords, rb_str_new2(*w));
+         w++;
+     }
+     return rstopwords;
+ }
+
+ /*
+  * call-seq:
+  *    AsciiStandardAnalyzer.new(stop_words = FULL_ENGLISH_STOP_WORDS, lower = true)
+  *        -> analyzer
+  *
+  * Create a new AsciiStandardAnalyzer which downcases tokens by default but
+  * can optionally leave case as is. Lowercasing will only be done to ASCII
+  * characters. You can also set the list of stop-words to be used by the
+  * StopFilter.
+  *
+  * lower::      set to false if you don't want the field's tokens to be downcased
+  * stop_words:: list of stop-words to pass to the StopFilter
+  */
+ static VALUE
+ frb_a_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
+ {
+     bool lower;
+     VALUE rlower, rstop_words;
+     Analyzer *a;
+     rb_scan_args(argc, argv, "02", &rstop_words, &rlower);
+     lower = ((rlower == Qnil) ? true : RTEST(rlower));
+     if (rstop_words != Qnil) {
+         char **stop_words = get_stopwords(rstop_words);
+         a = standard_analyzer_new_with_words((const char **)stop_words, lower);
+         free(stop_words);
+     } else {
+         a = standard_analyzer_new(lower);
+     }
+     Frt_Wrap_Struct(self, NULL, &frb_analyzer_free, a);
+     object_add(a, self);
+     return self;
+ }
+
+ /*
+  * call-seq:
+  *    StandardAnalyzer.new(stop_words = FULL_ENGLISH_STOP_WORDS, lower = true)
+  *        -> analyzer
+  *
+  * Create a new StandardAnalyzer which downcases tokens by default but can
+  * optionally leave case as is. Lowercasing will be done based on the current
+  * locale. You can also set the list of stop-words to be used by the
+  * StopFilter.
+  *
+  * lower::      set to false if you don't want the field's tokens to be downcased
+  * stop_words:: list of stop-words to pass to the StopFilter
+  */
+ static VALUE
+ frb_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
+ {
+     bool lower;
+     VALUE rlower, rstop_words;
+     Analyzer *a;
+ #ifndef POSH_OS_WIN32
+     if (!frb_locale) frb_locale = setlocale(LC_CTYPE, "");
+ #endif
+     rb_scan_args(argc, argv, "02", &rstop_words, &rlower);
+     lower = ((rlower == Qnil) ? true : RTEST(rlower));
+     if (rstop_words != Qnil) {
+         char **stop_words = get_stopwords(rstop_words);
+         a = mb_standard_analyzer_new_with_words((const char **)stop_words, lower);
+         free(stop_words);
+     } else {
+         a = mb_standard_analyzer_new(lower);
+     }
+     Frt_Wrap_Struct(self, NULL, &frb_analyzer_free, a);
+     object_add(a, self);
+     return self;
+ }
+
+ static void
+ frb_h_mark_values_i(void *key, void *value, void *arg)
+ {
+     frb_gc_mark(value);
+ }
+
+ static void
+ frb_pfa_mark(void *p)
+ {
+     frb_gc_mark(PFA(p)->default_a);
+     h_each(PFA(p)->dict, &frb_h_mark_values_i, NULL);
+ }
+
+ /*** PerFieldAnalyzer ***/
+
+ /*
+  * call-seq:
+  *    PerFieldAnalyzer.new(default_analyzer) -> analyzer
+  *
+  * Create a new PerFieldAnalyzer specifying the default analyzer to use on
+  * all fields that aren't set specifically.
+  *
+  * default_analyzer:: analyzer to be used on fields that aren't otherwise
+  *                    specified
+  */
+ static VALUE
+ frb_per_field_analyzer_init(VALUE self, VALUE ranalyzer)
+ {
+     Analyzer *def = frb_get_cwrapped_analyzer(ranalyzer);
+     Analyzer *a = per_field_analyzer_new(def);
+     Frt_Wrap_Struct(self, &frb_pfa_mark, &frb_analyzer_free, a);
+     object_add(a, self);
+     return self;
+ }
+
+ /*
+  * call-seq:
+  *    per_field_analyzer.add_field(field_name, analyzer) -> self
+  *    per_field_analyzer[field_name] = analyzer -> self
+  *
+  * Set the analyzer to be used on field +field_name+. Note that +field_name+
+  * should be a symbol.
+  *
+  * field_name:: field we wish to set the analyzer for
+  * analyzer::   analyzer to be used on +field_name+
+  */
+ static VALUE
+ frb_per_field_analyzer_add_field(VALUE self, VALUE rfield, VALUE ranalyzer)
+ {
+     Analyzer *pfa, *a;
+     Data_Get_Struct(self, Analyzer, pfa);
+     a = frb_get_cwrapped_analyzer(ranalyzer);
+
+     pfa_add_field(pfa, frb_field(rfield), a);
+     return self;
+ }
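+
+ /* A short sketch of PerFieldAnalyzer usage (field names are illustrative):
+  *
+  *    pfa = PerFieldAnalyzer.new(StandardAnalyzer.new)
+  *    pfa[:title] = WhiteSpaceAnalyzer.new(false)
+  *    pfa.add_field(:content, LetterAnalyzer.new)
+  */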
1551
+
1552
+ /*
1553
+ * call-seq:
1554
+ * analyzer.token_stream(field_name, input) -> token_stream
1555
+ *
1556
+ * Create a new TokenStream to tokenize +input+. The TokenStream created will
1557
+ * also depend on the +field_name+ in the case of the PerFieldAnalyzer.
1558
+ *
1559
+ * field_name:: name of the field to be tokenized
1560
+ * input:: data from the field to be tokenized
+ */
+ static VALUE
+ frb_pfa_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
+ {
+     Analyzer *pfa, *a;
+     Symbol field = frb_field(rfield);
+     GET_A(pfa, self);
+
+     StringValue(rstring);
+     a = (Analyzer *)h_get(PFA(pfa)->dict, field);
+     if (a == NULL) {
+         a = PFA(pfa)->default_a;
+     }
+     if (a->get_ts == cwa_get_ts) {
+         return rb_funcall(CWA(a)->ranalyzer, id_token_stream, 2,
+                           FSYM2SYM(field), rb_str_new2(rs2s(rstring)));
+     }
+     else {
+         return get_rb_ts_from_a(a, rfield, rstring);
+     }
+ }
+
+ /*** RegExpAnalyzer ***/
+
+ static void
+ frb_re_analyzer_mark(Analyzer *a)
+ {
+     frb_gc_mark(a->current_ts);
+ }
+
+ static void
+ re_analyzer_destroy_i(Analyzer *a)
+ {
+     ts_deref(a->current_ts);
+     free(a);
+ }
+
+ /*
+ * call-seq:
+ *    RegExpAnalyzer.new(reg_exp, lower = true) -> analyzer
+ *
+ * Create a new RegExpAnalyzer which will create tokenizers based on the
+ * given regular expression, lowercasing the tokens if required.
+ *
+ * reg_exp:: the token matcher for the tokenizer to use
+ * lower:: set to false if you don't want to downcase the tokens
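+ *
+ * === Example
+ *
+ * A minimal usage sketch:
+ *
+ *   analyzer = RegExpAnalyzer.new(/[[:alpha:]]+/, false)
+ *   ts = analyzer.token_stream(:field, "Dave's résumé")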
+ */
+ static VALUE
+ frb_re_analyzer_init(int argc, VALUE *argv, VALUE self)
+ {
+     VALUE lower, rets, regex, proc;
+     Analyzer *a;
+     TokenStream *ts;
+     rb_scan_args(argc, argv, "02&", &regex, &lower, &proc);
+
+     ts = rets_new(Qnil, regex, proc);
+     rets = Data_Wrap_Struct(cRegExpTokenizer, &frb_rets_mark, &frb_rets_free, ts);
+     object_add(ts, rets);
+
+     if (lower != Qfalse) {
+         rets = frb_lowercase_filter_init(frb_data_alloc(cLowerCaseFilter), rets);
+         ts = DATA_PTR(rets);
+     }
+     REF(ts);
+
+     a = analyzer_new(ts, &re_analyzer_destroy_i, NULL);
+     Frt_Wrap_Struct(self, &frb_re_analyzer_mark, &frb_analyzer_free, a);
+     object_add(a, self);
+     return self;
+ }
+
+ /*
+ * call-seq:
+ *    analyzer.token_stream(field_name, input) -> token_stream
+ *
+ * Create a new TokenStream to tokenize +input+. The TokenStream created may
+ * also depend on the +field_name+, although this parameter is typically
+ * ignored.
+ *
+ * field_name:: name of the field to be tokenized
+ * input:: data from the field to be tokenized
+ */
+ static VALUE
+ frb_re_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rtext)
+ {
+     TokenStream *ts;
+     Analyzer *a;
+     GET_A(a, self);
+
+     StringValue(rtext);
+
+     ts = a_get_ts(a, frb_field(rfield), rs2s(rtext));
+
+     /* Make sure that there is no entry already */
+     object_set(&ts->text, rtext);
+     if (ts->next == &rets_next) {
+         RETS(ts)->rtext = rtext;
+         rb_hash_aset(object_space, ((VALUE)ts)|1, rtext);
+     }
+     else {
+         RETS(((TokenFilter*)ts)->sub_ts)->rtext = rtext;
+         rb_hash_aset(object_space, ((VALUE)((TokenFilter*)ts)->sub_ts)|1, rtext);
+     }
+     return get_rb_token_stream(ts);
+ }
+
+ /****************************************************************************
+ *
+ * Locale stuff
+ *
+ ****************************************************************************/
+
+ /*
+ * call-seq:
+ *    Ferret.locale -> locale_str
+ *
+ * Returns a string corresponding to the currently set locale. For example;
+ *
+ *   puts Ferret.locale #=> "en_US.UTF-8"
+ */
+ static VALUE frb_get_locale(VALUE self)
+ {
+     return (frb_locale ? rb_str_new2(frb_locale) : Qnil);
+ }
+
+ /*
+ * call-seq:
+ *    Ferret.locale = "en_US.UTF-8"
+ *
+ * Set the global locale. You should use this method to set different locales
+ * when indexing documents with different encodings.
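+ *
+ * === Example
+ *
+ * A minimal sketch; the locale name must be available on your system:
+ *
+ *   Ferret.locale = "de_DE.UTF-8"
+ *   puts Ferret.locale #=> "de_DE.UTF-8"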
+ */
+ static VALUE frb_set_locale(VALUE self, VALUE locale)
+ {
+     char *l = ((locale == Qnil) ? NULL : rs2s(rb_obj_as_string(locale)));
+     frb_locale = setlocale(LC_CTYPE, l);
+     return frb_locale ? rb_str_new2(frb_locale) : Qnil;
+ }
+
+ /****************************************************************************
+ *
+ * Init Functions
+ *
+ ****************************************************************************/
+
+ /*
+ * Document-class: Ferret::Analysis::Token
+ *
+ * == Summary
+ *
+ * A Token is an occurrence of a term from the text of a field. It consists
+ * of a term's text and the start and end offset of the term in the text of
+ * the field.
+ *
+ * The start and end offsets permit applications to re-associate a token with
+ * its source text, e.g., to display highlighted query terms in a document
+ * browser, or to show matching text fragments in a KWIC (KeyWord In Context)
+ * display, etc.
+ *
+ * === Attributes
+ *
+ * text:: the term's text, which may have been modified by a TokenFilter or
+ *        Tokenizer from the text originally found in the document
+ * start:: the position of the first character corresponding to this token
+ *         in the source text
+ * end:: one greater than the position of the last character corresponding
+ *       to this token. Note that the difference between @end_offset and
+ *       @start_offset may not be equal to @text.length(), as the term text
+ *       may have been altered by a stemmer or some other filter.
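+ *
+ * === Example
+ *
+ * A minimal sketch; "begin" stands for the stemmed form of "Beginning"
+ * found at offsets 10 to 19 in the source text:
+ *
+ *   token = Token.new("begin", 10, 19)
+ *   token.end - token.start #=> 9, which need not equal token.text.length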
+ */
+ static void Init_Token(void)
+ {
+     cToken = rb_define_class_under(mAnalysis, "Token", rb_cObject);
+     rb_define_alloc_func(cToken, frb_token_alloc);
+     rb_include_module(cToken, rb_mComparable);
+
+     rb_define_method(cToken, "initialize", frb_token_init, -1);
+     rb_define_method(cToken, "<=>", frb_token_cmp, 1);
+     rb_define_method(cToken, "text", frb_token_get_text, 0);
+     rb_define_method(cToken, "text=", frb_token_set_text, 1);
+     rb_define_method(cToken, "start", frb_token_get_start_offset, 0);
+     rb_define_method(cToken, "start=", frb_token_set_start_offset, 1);
+     rb_define_method(cToken, "end", frb_token_get_end_offset, 0);
+     rb_define_method(cToken, "end=", frb_token_set_end_offset, 1);
+     rb_define_method(cToken, "pos_inc", frb_token_get_pos_inc, 0);
+     rb_define_method(cToken, "pos_inc=", frb_token_set_pos_inc, 1);
+     rb_define_method(cToken, "to_s", frb_token_to_s, 0);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::TokenStream
+ *
+ * == Summary
+ *
+ * A TokenStream enumerates the sequence of tokens, either from
+ * fields of a document or from query text.
+ *
+ * This is an abstract class. Concrete subclasses are:
+ *
+ * Tokenizer:: a TokenStream whose input is a string
+ * TokenFilter:: a TokenStream whose input is another TokenStream
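+ *
+ * === Example
+ *
+ * A minimal sketch of enumerating a stream; #next returns nil once the
+ * stream is exhausted:
+ *
+ *   ts = StandardTokenizer.new("Dave's résumé")
+ *   while token = ts.next
+ *     puts token.text
+ *   end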
+ */
+ static void Init_TokenStream(void)
+ {
+     cTokenStream = rb_define_class_under(mAnalysis, "TokenStream", rb_cObject);
+     frb_mark_cclass(cTokenStream);
+     rb_define_method(cTokenStream, "next", frb_ts_next, 0);
+     rb_define_method(cTokenStream, "text=", frb_ts_set_text, 1);
+     rb_define_method(cTokenStream, "text", frb_ts_get_text, 0);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::AsciiLetterTokenizer
+ *
+ * == Summary
+ *
+ * An AsciiLetterTokenizer is a tokenizer that divides text at non-letters,
+ * treating anything outside of A-Za-z as a separator. That is to say, it
+ * defines tokens as maximal strings of adjacent ASCII letters, as defined
+ * by the regular expression _/[A-Za-z]+/_.
+ *
+ * === Example
+ *
+ *   "Dave's résumé, at http://www.davebalmain.com/ 1234"
+ *   => ["Dave", "s", "r", "sum", "at", "http", "www", "davebalmain", "com"]
+ */
+ static void Init_AsciiLetterTokenizer(void)
+ {
+     cAsciiLetterTokenizer =
+         rb_define_class_under(mAnalysis, "AsciiLetterTokenizer", cTokenStream);
+     frb_mark_cclass(cAsciiLetterTokenizer);
+     rb_define_alloc_func(cAsciiLetterTokenizer, frb_data_alloc);
+     rb_define_method(cAsciiLetterTokenizer, "initialize",
+                      frb_a_letter_tokenizer_init, 1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::LetterTokenizer
+ *
+ * == Summary
+ *
+ * A LetterTokenizer is a tokenizer that divides text at non-letters. That is
+ * to say, it defines tokens as maximal strings of adjacent letters, as
+ * defined by the regular expression _/[[:alpha:]]+/_, where [[:alpha:]]
+ * matches all letter characters in the current locale.
+ *
+ * === Example
+ *
+ *   "Dave's résumé, at http://www.davebalmain.com/ 1234"
+ *   => ["Dave", "s", "résumé", "at", "http", "www", "davebalmain", "com"]
+ */
+ static void Init_LetterTokenizer(void)
+ {
+     cLetterTokenizer =
+         rb_define_class_under(mAnalysis, "LetterTokenizer", cTokenStream);
+     frb_mark_cclass(cLetterTokenizer);
+     rb_define_alloc_func(cLetterTokenizer, frb_data_alloc);
+     rb_define_method(cLetterTokenizer, "initialize",
+                      frb_letter_tokenizer_init, -1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::AsciiWhiteSpaceTokenizer
+ *
+ * == Summary
+ *
+ * A WhiteSpaceTokenizer is a tokenizer that divides text at white-space.
+ * Adjacent sequences of non-WhiteSpace characters form tokens.
+ *
+ * === Example
+ *
+ *   "Dave's résumé, at http://www.davebalmain.com/ 1234"
+ *   => ["Dave's", "résumé,", "at", "http://www.davebalmain.com", "1234"]
+ */
+ static void Init_AsciiWhiteSpaceTokenizer(void)
+ {
+     cAsciiWhiteSpaceTokenizer =
+         rb_define_class_under(mAnalysis, "AsciiWhiteSpaceTokenizer",
+                               cTokenStream);
+     frb_mark_cclass(cAsciiWhiteSpaceTokenizer);
+     rb_define_alloc_func(cAsciiWhiteSpaceTokenizer, frb_data_alloc);
+     rb_define_method(cAsciiWhiteSpaceTokenizer, "initialize",
+                      frb_a_whitespace_tokenizer_init, 1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::WhiteSpaceTokenizer
+ *
+ * == Summary
+ *
+ * A WhiteSpaceTokenizer is a tokenizer that divides text at white-space.
+ * Adjacent sequences of non-WhiteSpace characters form tokens.
+ *
+ * === Example
+ *
+ *   "Dave's résumé, at http://www.davebalmain.com/ 1234"
+ *   => ["Dave's", "résumé,", "at", "http://www.davebalmain.com", "1234"]
+ */
+ static void Init_WhiteSpaceTokenizer(void)
+ {
+     cWhiteSpaceTokenizer =
+         rb_define_class_under(mAnalysis, "WhiteSpaceTokenizer", cTokenStream);
+     frb_mark_cclass(cWhiteSpaceTokenizer);
+     rb_define_alloc_func(cWhiteSpaceTokenizer, frb_data_alloc);
+     rb_define_method(cWhiteSpaceTokenizer, "initialize",
+                      frb_whitespace_tokenizer_init, -1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::AsciiStandardTokenizer
+ *
+ * == Summary
+ *
+ * The standard tokenizer is an advanced tokenizer which tokenizes most
+ * words correctly as well as tokenizing things like email addresses, web
+ * addresses, phone numbers, etc.
+ *
+ * === Example
+ *
+ *   "Dave's résumé, at http://www.davebalmain.com/ 1234"
+ *   => ["Dave's", "r", "sum", "at", "http://www.davebalmain.com", "1234"]
+ */
+ static void Init_AsciiStandardTokenizer(void)
+ {
+     cAsciiStandardTokenizer =
+         rb_define_class_under(mAnalysis, "AsciiStandardTokenizer", cTokenStream);
+     frb_mark_cclass(cAsciiStandardTokenizer);
+     rb_define_alloc_func(cAsciiStandardTokenizer, frb_data_alloc);
+     rb_define_method(cAsciiStandardTokenizer, "initialize",
+                      frb_a_standard_tokenizer_init, 1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::StandardTokenizer
+ *
+ * == Summary
+ *
+ * The standard tokenizer is an advanced tokenizer which tokenizes most
+ * words correctly as well as tokenizing things like email addresses, web
+ * addresses, phone numbers, etc.
+ *
+ * === Example
+ *
+ *   "Dave's résumé, at http://www.davebalmain.com/ 1234"
+ *   => ["Dave's", "résumé", "at", "http://www.davebalmain.com", "1234"]
+ */
+ static void Init_StandardTokenizer(void)
+ {
+     cStandardTokenizer =
+         rb_define_class_under(mAnalysis, "StandardTokenizer", cTokenStream);
+     frb_mark_cclass(cStandardTokenizer);
+     rb_define_alloc_func(cStandardTokenizer, frb_data_alloc);
+     rb_define_method(cStandardTokenizer, "initialize",
+                      frb_standard_tokenizer_init, 1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::RegExpTokenizer
+ *
+ * == Summary
+ *
+ * A tokenizer that recognizes tokens based on a regular expression passed to
+ * the constructor. Most tokenizers you are likely to need can be built with
+ * this class.
+ *
+ * === Example
+ *
+ * Below is an example of a simple implementation of a LetterTokenizer using
+ * a RegExpTokenizer. Basically, a token is a sequence of alphabetic
+ * characters separated by one or more non-alphabetic characters.
+ *
+ *   # of course you would add more than just é
+ *   RegExpTokenizer.new(input, /[[:alpha:]é]+/)
+ *
+ *   "Dave's résumé, at http://www.davebalmain.com/ 1234"
+ *   => ["Dave", "s", "résumé", "at", "http", "www", "davebalmain", "com"]
+ */
+ static void Init_RegExpTokenizer(void)
+ {
+     cRegExpTokenizer =
+         rb_define_class_under(mAnalysis, "RegExpTokenizer", cTokenStream);
+     frb_mark_cclass(cRegExpTokenizer);
+     rtoken_re = rb_reg_new(TOKEN_RE, strlen(TOKEN_RE), 0);
+     rb_define_const(cRegExpTokenizer, "REGEXP", rtoken_re);
+     rb_define_alloc_func(cRegExpTokenizer, frb_data_alloc);
+     rb_define_method(cRegExpTokenizer, "initialize",
+                      frb_rets_init, -1);
+     rb_define_method(cRegExpTokenizer, "text=", frb_rets_set_text, 1);
+     rb_define_method(cRegExpTokenizer, "text", frb_rets_get_text, 0);
+ }
+
+ /***************/
+ /*** Filters ***/
+ /***************/
+
+ /*
+ * Document-class: Ferret::Analysis::AsciiLowerCaseFilter
+ *
+ * == Summary
+ *
+ * AsciiLowerCaseFilter normalizes a token's text to lowercase but only for
+ * ASCII characters. For other characters use LowerCaseFilter.
+ *
+ * === Example
+ *
+ *   ["One", "TWO", "three", "RÉSUMÉ"] => ["one", "two", "three", "rÉsumÉ"]
+ *
+ */
+ static void Init_AsciiLowerCaseFilter(void)
+ {
+     cAsciiLowerCaseFilter =
+         rb_define_class_under(mAnalysis, "AsciiLowerCaseFilter", cTokenStream);
+     frb_mark_cclass(cAsciiLowerCaseFilter);
+     rb_define_alloc_func(cAsciiLowerCaseFilter, frb_data_alloc);
+     rb_define_method(cAsciiLowerCaseFilter, "initialize",
+                      frb_a_lowercase_filter_init, 1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::LowerCaseFilter
+ *
+ * == Summary
+ *
+ * LowerCaseFilter normalizes a token's text to lowercase based on the
+ * current locale.
+ *
+ * === Example
+ *
+ *   ["One", "TWO", "three", "RÉSUMÉ"] => ["one", "two", "three", "résumé"]
+ *
+ */
+ static void Init_LowerCaseFilter(void)
+ {
+     cLowerCaseFilter =
+         rb_define_class_under(mAnalysis, "LowerCaseFilter", cTokenStream);
+     frb_mark_cclass(cLowerCaseFilter);
+     rb_define_alloc_func(cLowerCaseFilter, frb_data_alloc);
+     rb_define_method(cLowerCaseFilter, "initialize",
+                      frb_lowercase_filter_init, 1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::HyphenFilter
+ *
+ * == Summary
+ *
+ * HyphenFilter filters hyphenated words by adding both the concatenated form
+ * and the split form of each word, i.e. "e-mail" becomes "email" and
+ * "e mail". This way searches for "e-mail", "email" and "mail" will all
+ * match. This filter is used by default by the StandardAnalyzer.
+ *
+ * === Example
+ *
+ *   ["e-mail", "set-up"] => ["email", "e", "mail", "setup", "set", "up"]
+ *
+ */
+ static void Init_HyphenFilter(void)
+ {
+     cHyphenFilter =
+         rb_define_class_under(mAnalysis, "HyphenFilter", cTokenStream);
+     frb_mark_cclass(cHyphenFilter);
+     rb_define_alloc_func(cHyphenFilter, frb_data_alloc);
+     rb_define_method(cHyphenFilter, "initialize", frb_hyphen_filter_init, 1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::MappingFilter
+ *
+ * == Summary
+ *
+ * A MappingFilter maps strings in tokens. This is usually used to map UTF-8
+ * characters to ASCII characters for easier searching and better search
+ * recall. The mapping is compiled into a Deterministic Finite Automaton so
+ * it is super fast. This Filter can therefore be used for indexing very
+ * large datasets. Currently regular expressions are not supported. If you
+ * are really interested in the feature, please contact me at
+ * dbalmain@gmail.com.
+ *
+ * == Example
+ *
+ *   mapping = {
+ *     ['à','á','â','ã','ä','å','ā','ă'] => 'a',
+ *     'æ' => 'ae',
+ *     ['ď','đ'] => 'd',
+ *     ['ç','ć','č','ĉ','ċ'] => 'c',
+ *     ['è','é','ê','ë','ē','ę','ě','ĕ','ė'] => 'e',
+ *     ['ƒ'] => 'f',
+ *     ['ĝ','ğ','ġ','ģ'] => 'g',
+ *     ['ĥ','ħ'] => 'h',
+ *     ['ì','í','î','ï','ī','ĩ','ĭ'] => 'i',
+ *     ['į','ı','ij','ĵ'] => 'j',
+ *     ['ķ','ĸ'] => 'k',
+ *     ['ł','ľ','ĺ','ļ','ŀ'] => 'l',
+ *     ['ñ','ń','ň','ņ','ʼn','ŋ'] => 'n',
+ *     ['ò','ó','ô','õ','ö','ø','ō','ő','ŏ'] => 'o',
+ *     ['œ'] => 'oe',
+ *     ['ą'] => 'a',
+ *     ['ŕ','ř','ŗ'] => 'r',
+ *     ['ś','š','ş','ŝ','ș'] => 's',
+ *     ['ť','ţ','ŧ','ț'] => 't',
+ *     ['ù','ú','û','ü','ū','ů','ű','ŭ','ũ','ų'] => 'u',
+ *     ['ŵ'] => 'w',
+ *     ['ý','ÿ','ŷ'] => 'y',
+ *     ['ž','ż','ź'] => 'z'
+ *   }
+ *   filt = MappingFilter.new(token_stream, mapping)
+ */
+ static void Init_MappingFilter(void)
+ {
+     cMappingFilter =
+         rb_define_class_under(mAnalysis, "MappingFilter", cTokenStream);
+     frb_mark_cclass(cMappingFilter);
+     rb_define_alloc_func(cMappingFilter, frb_data_alloc);
+     rb_define_method(cMappingFilter, "initialize",
+                      frb_mapping_filter_init, 2);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::StopFilter
+ *
+ * == Summary
+ *
+ * A StopFilter filters *stop-words* from a TokenStream. Stop-words are words
+ * that you don't wish to be indexed. Usually they will be common words like
+ * "the" and "and", although you can specify whichever words you want.
+ *
+ * === Example
+ *
+ *   ["the", "pig", "and", "whistle"] => ["pig", "whistle"]
+ */
+ static void Init_StopFilter(void)
+ {
+     cStopFilter =
+         rb_define_class_under(mAnalysis, "StopFilter", cTokenStream);
+     frb_mark_cclass(cStopFilter);
+     rb_define_alloc_func(cStopFilter, frb_data_alloc);
+     rb_define_method(cStopFilter, "initialize",
+                      frb_stop_filter_init, -1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::StemFilter
+ *
+ * == Summary
+ *
+ * A StemFilter takes a term and transforms the term as per the Snowball
+ * stemming algorithm. Note: the input to the stemming filter must already
+ * be in lower case, so you will need to use a LowerCaseFilter or a
+ * lowercasing Tokenizer further down the TokenStream chain in order for
+ * this to work properly!
+ *
+ * === Available algorithms and encodings
+ *
+ *   Algorithm    | Algorithm Pseudonyms     | Encoding
+ *   -------------|--------------------------|-----------------------
+ *   "danish"     | "da", "dan"              | "ISO_8859_1", "UTF_8"
+ *   "dutch"      | "dut", "nld"             | "ISO_8859_1", "UTF_8"
+ *   "english"    | "en", "eng"              | "ISO_8859_1", "UTF_8"
+ *   "finnish"    | "fi", "fin"              | "ISO_8859_1", "UTF_8"
+ *   "french"     | "fr", "fra", "fre"       | "ISO_8859_1", "UTF_8"
+ *   "german"     | "de", "deu", "ge", "ger" | "ISO_8859_1", "UTF_8"
+ *   "hungarian"  | "hu", "hun"              | "ISO_8859_1", "UTF_8"
+ *   "italian"    | "it", "ita"              | "ISO_8859_1", "UTF_8"
+ *   "norwegian"  | "no", "nor"              | "ISO_8859_1", "UTF_8"
+ *   "porter"     |                          | "ISO_8859_1", "UTF_8"
+ *   "portuguese" | "por", "pt"              | "ISO_8859_1", "UTF_8"
+ *   "romanian"   | "ro", "ron", "rum"       | "ISO_8859_2", "UTF_8"
+ *   "russian"    | "ru", "rus"              | "KOI8_R", "UTF_8"
+ *   "spanish"    | "es", "esl"              | "ISO_8859_1", "UTF_8"
+ *   "swedish"    | "sv", "swe"              | "ISO_8859_1", "UTF_8"
+ *   "turkish"    | "tr", "tur"              | "UTF_8"
+ *
+ * === New Stemmers
+ *
+ * The following stemmers have recently been added. Please try them out;
+ *
+ * * Hungarian
+ * * Romanian
+ * * Turkish
+ *
+ * === Example
+ *
+ * To use this filter with other analyzers, you'll want to write an Analyzer
+ * class that sets up the TokenStream chain as you want it. To use this with
+ * a lowercasing Tokenizer, for example, you'd write an analyzer like this:
+ *
+ *   class MyAnalyzer < Analyzer
+ *     def token_stream(field, str)
+ *       return StemFilter.new(LowerCaseFilter.new(StandardTokenizer.new(str)))
+ *     end
+ *   end
+ *
+ *   "debate debates debated debating debater"
+ *   => ["debat", "debat", "debat", "debat", "debat"]
+ *
+ * === Attributes
+ *
+ * token_stream:: TokenStream to be filtered
+ * algorithm:: The algorithm (or language) to use (default: "english")
+ * encoding:: The encoding of the data (default: "UTF-8")
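+ *
+ * A minimal construction sketch; +ts+ is assumed to be an existing
+ * lowercased TokenStream:
+ *
+ *   stemmed_ts = StemFilter.new(ts, "spanish", "UTF_8")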
+ */
+ static void Init_StemFilter(void)
+ {
+     cStemFilter =
+         rb_define_class_under(mAnalysis, "StemFilter", cTokenStream);
+     frb_mark_cclass(cStemFilter);
+     rb_define_alloc_func(cStemFilter, frb_data_alloc);
+     rb_define_method(cStemFilter, "initialize",
+                      frb_stem_filter_init, -1);
+ }
+
+ /*************************/
+ /*** * * Analyzers * * ***/
+ /*************************/
+
+ /*
+ * Document-class: Ferret::Analysis::Analyzer
+ *
+ * == Summary
+ *
+ * An Analyzer builds TokenStreams, which analyze text. It thus represents
+ * a policy for extracting index terms from text.
+ *
+ * Typical implementations first build a Tokenizer, which breaks the stream
+ * of characters from the Reader into raw Tokens. One or more TokenFilters
+ * may then be applied to the output of the Tokenizer.
+ *
+ * The default Analyzer just creates a LowerCaseTokenizer which converts
+ * all text to lowercase tokens. See LowerCaseTokenizer for more details.
+ *
+ * === Example
+ *
+ * To create your own custom Analyzer you simply need to implement a
+ * token_stream method which takes the field name and the data to be
+ * tokenized as parameters and returns a TokenStream. Most analyzers
+ * typically ignore the field name.
+ *
+ * Here we'll create a StemmingAnalyzer;
+ *
+ *   class MyAnalyzer < Analyzer
+ *     def token_stream(field, str)
+ *       return StemFilter.new(LowerCaseFilter.new(StandardTokenizer.new(str)))
+ *     end
+ *   end
+ */
+ static void Init_Analyzer(void)
+ {
+     cAnalyzer =
+         rb_define_class_under(mAnalysis, "Analyzer", rb_cObject);
+     frb_mark_cclass(cAnalyzer);
+     rb_define_alloc_func(cAnalyzer, frb_data_alloc);
+     rb_define_method(cAnalyzer, "initialize", frb_letter_analyzer_init, -1);
+     rb_define_method(cAnalyzer, "token_stream", frb_analyzer_token_stream, 2);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::AsciiLetterAnalyzer
+ *
+ * == Summary
+ *
+ * An AsciiLetterAnalyzer creates a TokenStream that splits the input up into
+ * maximal strings of ASCII letters. If implemented in Ruby it would look
+ * like;
+ *
+ *   class AsciiLetterAnalyzer
+ *     def initialize(lower = true)
+ *       @lower = lower
+ *     end
+ *
+ *     def token_stream(field, str)
+ *       if @lower
+ *         return AsciiLowerCaseFilter.new(AsciiLetterTokenizer.new(str))
+ *       else
+ *         return AsciiLetterTokenizer.new(str)
+ *       end
+ *     end
+ *   end
+ *
+ * As you can see it makes use of the AsciiLetterTokenizer and
+ * AsciiLowerCaseFilter. Note that this tokenizer won't recognize non-ASCII
+ * characters so you should use the LetterAnalyzer if you want to analyze
+ * multi-byte data like "UTF-8".
+ */
+ static void Init_AsciiLetterAnalyzer(void)
+ {
+     cAsciiLetterAnalyzer =
+         rb_define_class_under(mAnalysis, "AsciiLetterAnalyzer", cAnalyzer);
+     frb_mark_cclass(cAsciiLetterAnalyzer);
+     rb_define_alloc_func(cAsciiLetterAnalyzer, frb_data_alloc);
+     rb_define_method(cAsciiLetterAnalyzer, "initialize",
+                      frb_a_letter_analyzer_init, -1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::LetterAnalyzer
+ *
+ * == Summary
+ *
+ * A LetterAnalyzer creates a TokenStream that splits the input up into
+ * maximal strings of letters, as recognized by the current locale. If
+ * implemented in Ruby it would look like;
+ *
+ *   class LetterAnalyzer
+ *     def initialize(lower = true)
+ *       @lower = lower
+ *     end
+ *
+ *     def token_stream(field, str)
+ *       return LetterTokenizer.new(str, @lower)
+ *     end
+ *   end
+ *
+ * As you can see it makes use of the LetterTokenizer.
+ */
+ static void Init_LetterAnalyzer(void)
+ {
+     cLetterAnalyzer =
+         rb_define_class_under(mAnalysis, "LetterAnalyzer", cAnalyzer);
+     frb_mark_cclass(cLetterAnalyzer);
+     rb_define_alloc_func(cLetterAnalyzer, frb_data_alloc);
+     rb_define_method(cLetterAnalyzer, "initialize",
+                      frb_letter_analyzer_init, -1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::AsciiWhiteSpaceAnalyzer
+ *
+ * == Summary
+ *
+ * The AsciiWhiteSpaceAnalyzer recognizes tokens as maximal strings of
+ * non-whitespace characters. If implemented in Ruby the
+ * AsciiWhiteSpaceAnalyzer would look like;
+ *
+ *   class AsciiWhiteSpaceAnalyzer
+ *     def initialize(lower = true)
+ *       @lower = lower
+ *     end
+ *
+ *     def token_stream(field, str)
+ *       if @lower
+ *         return AsciiLowerCaseFilter.new(AsciiWhiteSpaceTokenizer.new(str))
+ *       else
+ *         return AsciiWhiteSpaceTokenizer.new(str)
+ *       end
+ *     end
+ *   end
+ *
+ * As you can see it makes use of the AsciiWhiteSpaceTokenizer. You should
+ * use WhiteSpaceAnalyzer if you want to recognize multibyte encodings such
+ * as "UTF-8".
+ */
+ static void Init_AsciiWhiteSpaceAnalyzer(void)
+ {
+     cAsciiWhiteSpaceAnalyzer =
+         rb_define_class_under(mAnalysis, "AsciiWhiteSpaceAnalyzer", cAnalyzer);
+     frb_mark_cclass(cAsciiWhiteSpaceAnalyzer);
+     rb_define_alloc_func(cAsciiWhiteSpaceAnalyzer, frb_data_alloc);
+     rb_define_method(cAsciiWhiteSpaceAnalyzer, "initialize",
+                      frb_a_white_space_analyzer_init, -1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::WhiteSpaceAnalyzer
+ *
+ * == Summary
+ *
+ * The WhiteSpaceAnalyzer recognizes tokens as maximal strings of
+ * non-whitespace characters. If implemented in Ruby the WhiteSpaceAnalyzer
+ * would look like;
+ *
+ *   class WhiteSpaceAnalyzer
+ *     def initialize(lower = true)
+ *       @lower = lower
+ *     end
+ *
+ *     def token_stream(field, str)
+ *       return WhiteSpaceTokenizer.new(str, @lower)
+ *     end
+ *   end
+ *
+ * As you can see it makes use of the WhiteSpaceTokenizer.
+ */
+ static void Init_WhiteSpaceAnalyzer(void)
+ {
+     cWhiteSpaceAnalyzer =
+         rb_define_class_under(mAnalysis, "WhiteSpaceAnalyzer", cAnalyzer);
+     frb_mark_cclass(cWhiteSpaceAnalyzer);
+     rb_define_alloc_func(cWhiteSpaceAnalyzer, frb_data_alloc);
+     rb_define_method(cWhiteSpaceAnalyzer, "initialize",
+                      frb_white_space_analyzer_init, -1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::AsciiStandardAnalyzer
+ *
+ * == Summary
+ *
+ * The AsciiStandardAnalyzer is the most advanced of the available
+ * ASCII-analyzers. If it were implemented in Ruby it would look like this;
+ *
+ *   class AsciiStandardAnalyzer
+ *     def initialize(stop_words = FULL_ENGLISH_STOP_WORDS, lower = true)
+ *       @lower = lower
+ *       @stop_words = stop_words
+ *     end
+ *
+ *     def token_stream(field, str)
+ *       ts = AsciiStandardTokenizer.new(str)
+ *       ts = AsciiLowerCaseFilter.new(ts) if @lower
+ *       ts = StopFilter.new(ts, @stop_words)
+ *       ts = HyphenFilter.new(ts)
+ *     end
+ *   end
+ *
+ * As you can see it makes use of the AsciiStandardTokenizer and you can also
+ * add your own list of stop-words if you wish. Note that this tokenizer
+ * won't recognize non-ASCII characters so you should use the
+ * StandardAnalyzer if you want to analyze multi-byte data like "UTF-8".
+ */
+ static void Init_AsciiStandardAnalyzer(void)
+ {
+     cAsciiStandardAnalyzer =
+         rb_define_class_under(mAnalysis, "AsciiStandardAnalyzer", cAnalyzer);
+     frb_mark_cclass(cAsciiStandardAnalyzer);
+     rb_define_alloc_func(cAsciiStandardAnalyzer, frb_data_alloc);
+     rb_define_method(cAsciiStandardAnalyzer, "initialize",
+                      frb_a_standard_analyzer_init, -1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::StandardAnalyzer
+ *
+ * == Summary
+ *
+ * The StandardAnalyzer is the most advanced of the available analyzers. If
+ * it were implemented in Ruby it would look like this;
+ *
+ *   class StandardAnalyzer
+ *     def initialize(stop_words = FULL_ENGLISH_STOP_WORDS, lower = true)
+ *       @lower = lower
+ *       @stop_words = stop_words
+ *     end
+ *
+ *     def token_stream(field, str)
+ *       ts = StandardTokenizer.new(str)
+ *       ts = LowerCaseFilter.new(ts) if @lower
+ *       ts = StopFilter.new(ts, @stop_words)
+ *       ts = HyphenFilter.new(ts)
+ *     end
+ *   end
+ *
+ * As you can see it makes use of the StandardTokenizer and you can also add
+ * your own list of stop-words if you wish.
+ */
+ static void Init_StandardAnalyzer(void)
+ {
+     cStandardAnalyzer =
+         rb_define_class_under(mAnalysis, "StandardAnalyzer", cAnalyzer);
+     frb_mark_cclass(cStandardAnalyzer);
+     rb_define_alloc_func(cStandardAnalyzer, frb_data_alloc);
+     rb_define_method(cStandardAnalyzer, "initialize",
+                      frb_standard_analyzer_init, -1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::PerFieldAnalyzer
+ *
+ * == Summary
+ *
+ * The PerFieldAnalyzer is for use when you want to analyze different fields
+ * with different analyzers. With the PerFieldAnalyzer you can specify how
+ * you want each field analyzed.
+ *
+ * === Example
+ *
+ *   # Create a new PerFieldAnalyzer which uses StandardAnalyzer by default
+ *   pfa = PerFieldAnalyzer.new(StandardAnalyzer.new)
+ *
+ *   # Use the WhiteSpaceAnalyzer with no lowercasing on the :title field
+ *   pfa[:title] = WhiteSpaceAnalyzer.new(false)
+ *
+ *   # Use a custom analyzer on the :created_at field
+ *   pfa[:created_at] = DateAnalyzer.new
+ */
+ static void Init_PerFieldAnalyzer(void)
+ {
+     cPerFieldAnalyzer =
+         rb_define_class_under(mAnalysis, "PerFieldAnalyzer", cAnalyzer);
+     frb_mark_cclass(cPerFieldAnalyzer);
+     rb_define_alloc_func(cPerFieldAnalyzer, frb_data_alloc);
+     rb_define_method(cPerFieldAnalyzer, "initialize",
+                      frb_per_field_analyzer_init, 1);
+     rb_define_method(cPerFieldAnalyzer, "add_field",
+                      frb_per_field_analyzer_add_field, 2);
+     rb_define_method(cPerFieldAnalyzer, "[]=",
+                      frb_per_field_analyzer_add_field, 2);
+     rb_define_method(cPerFieldAnalyzer, "token_stream",
+                      frb_pfa_analyzer_token_stream, 2);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::RegExpAnalyzer
+ *
+ * == Summary
+ *
+ * Using a RegExpAnalyzer is a simple way to create a custom analyzer. If
+ * implemented in Ruby it would look like this;
+ *
+ *   class RegExpAnalyzer
+ *     def initialize(reg_exp, lower = true)
+ *       @lower = lower
+ *       @reg_exp = reg_exp
+ *     end
+ *
+ *     def token_stream(field, str)
+ *       if @lower
+ *         return LowerCaseFilter.new(RegExpTokenizer.new(str, @reg_exp))
+ *       else
+ *         return RegExpTokenizer.new(str, @reg_exp)
+ *       end
+ *     end
+ *   end
+ *
+ * === Example
+ *
+ *   csv_analyzer = RegExpAnalyzer.new(/[^,]+/, false)
+ */
+ static void Init_RegExpAnalyzer(void)
+ {
+     cRegExpAnalyzer =
+         rb_define_class_under(mAnalysis, "RegExpAnalyzer", cAnalyzer);
+     frb_mark_cclass(cRegExpAnalyzer);
+     rb_define_alloc_func(cRegExpAnalyzer, frb_data_alloc);
+     rb_define_method(cRegExpAnalyzer, "initialize",
+                      frb_re_analyzer_init, -1);
+     rb_define_method(cRegExpAnalyzer, "token_stream",
+                      frb_re_analyzer_token_stream, 2);
+ }
+
+ /* rdoc hack
+ extern VALUE mFerret = rb_define_module("Ferret");
+ */
+
+ /*
+ * Document-module: Ferret::Analysis
+ *
+ * == Summary
+ *
+ * The Analysis module contains all the classes used to analyze and tokenize
+ * the data to be indexed. There are three main classes you need to know
+ * about when dealing with analysis; Analyzer, TokenStream and Token.
+ *
+ * == Classes
+ *
+ * === Analyzer
+ *
+ * Analyzers handle all of your tokenizing needs. You pass an Analyzer to the
+ * indexing class when you create it and it will create the TokenStreams
+ * necessary to tokenize the fields in the documents. Most of the time you
+ * won't need to worry about TokenStreams and Tokens; one of the Analyzers
+ * distributed with Ferret will usually do exactly what you need. Otherwise
+ * you'll need to implement a custom analyzer.
+ *
+ * === TokenStream
+ *
+ * A TokenStream is an enumeration of Tokens. There are two standard types of
+ * TokenStream; Tokenizer and TokenFilter. A Tokenizer takes a String and
+ * turns it into a list of Tokens. A TokenFilter takes another TokenStream
+ * and post-processes the Tokens. You can chain as many TokenFilters together
+ * as you like, but the chain always needs to finish with a Tokenizer.
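+ *
+ * A minimal sketch of such a chain, with the Tokenizer at its source:
+ *
+ *   ts = StopFilter.new(LowerCaseFilter.new(StandardTokenizer.new(str)))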
+ *
+ * === Token
+ *
+ * A Token is a single term from a document field. A token contains the text
+ * representing the term as well as the start and end offset of the token.
+ * The start and end offsets will represent the token as it appears in the
+ * source field. Some TokenFilters may change the text in the Token but the
+ * start and end offsets should stay the same, so (end - start) won't
+ * necessarily be equal to the length of the text in the token. For example,
+ * using a stemming TokenFilter, the term "Beginning" might have start and
+ * end offsets of 10 and 19 respectively ("Beginning".length == 9) but
+ * Token#text might be "begin" (after stemming).
+ */
+ void
+ Init_Analysis(void)
+ {
+     mAnalysis = rb_define_module_under(mFerret, "Analysis");
+
+     /* TokenStream Methods */
+     id_next = rb_intern("next");
+     id_reset = rb_intern("text=");
+     id_clone = rb_intern("clone");
+     id_text = rb_intern("@text");
+
+     /* Analyzer Methods */
+     id_token_stream = rb_intern("token_stream");
+
+     object_space = rb_hash_new();
+     rb_define_const(mFerret, "OBJECT_SPACE", object_space);
+
+     /*** * * Locale stuff * * ***/
+     rb_define_singleton_method(mFerret, "locale=", frb_set_locale, 1);
+     rb_define_singleton_method(mFerret, "locale", frb_get_locale, 0);
+
+     rb_define_const(mAnalysis, "ENGLISH_STOP_WORDS",
+                     get_rstopwords(ENGLISH_STOP_WORDS));
+     rb_define_const(mAnalysis, "FULL_ENGLISH_STOP_WORDS",
+                     get_rstopwords(FULL_ENGLISH_STOP_WORDS));
+     rb_define_const(mAnalysis, "EXTENDED_ENGLISH_STOP_WORDS",
+                     get_rstopwords(EXTENDED_ENGLISH_STOP_WORDS));
+     rb_define_const(mAnalysis, "FULL_FRENCH_STOP_WORDS",
+                     get_rstopwords(FULL_FRENCH_STOP_WORDS));
+     rb_define_const(mAnalysis, "FULL_SPANISH_STOP_WORDS",
+                     get_rstopwords(FULL_SPANISH_STOP_WORDS));
+     rb_define_const(mAnalysis, "FULL_PORTUGUESE_STOP_WORDS",
+                     get_rstopwords(FULL_PORTUGUESE_STOP_WORDS));
+     rb_define_const(mAnalysis, "FULL_ITALIAN_STOP_WORDS",
+                     get_rstopwords(FULL_ITALIAN_STOP_WORDS));
+     rb_define_const(mAnalysis, "FULL_GERMAN_STOP_WORDS",
+                     get_rstopwords(FULL_GERMAN_STOP_WORDS));
+     rb_define_const(mAnalysis, "FULL_DUTCH_STOP_WORDS",
+                     get_rstopwords(FULL_DUTCH_STOP_WORDS));
+     rb_define_const(mAnalysis, "FULL_SWEDISH_STOP_WORDS",
+                     get_rstopwords(FULL_SWEDISH_STOP_WORDS));
+     rb_define_const(mAnalysis, "FULL_NORWEGIAN_STOP_WORDS",
+                     get_rstopwords(FULL_NORWEGIAN_STOP_WORDS));
+     rb_define_const(mAnalysis, "FULL_DANISH_STOP_WORDS",
+                     get_rstopwords(FULL_DANISH_STOP_WORDS));
+     rb_define_const(mAnalysis, "FULL_RUSSIAN_STOP_WORDS",
+                     get_rstopwords(FULL_RUSSIAN_STOP_WORDS));
+     rb_define_const(mAnalysis, "FULL_FINNISH_STOP_WORDS",
+                     get_rstopwords(FULL_FINNISH_STOP_WORDS));
+     rb_define_const(mAnalysis, "FULL_HUNGARIAN_STOP_WORDS",
+                     get_rstopwords(FULL_HUNGARIAN_STOP_WORDS));
+
+     Init_Token();
+     Init_TokenStream();
+
+     Init_AsciiLetterTokenizer();
+     Init_LetterTokenizer();
+
+     Init_AsciiWhiteSpaceTokenizer();
+     Init_WhiteSpaceTokenizer();
+
+     Init_AsciiStandardTokenizer();
+     Init_StandardTokenizer();
+
+     Init_RegExpTokenizer();
+
+     Init_AsciiLowerCaseFilter();
+     Init_LowerCaseFilter();
+     Init_HyphenFilter();
+     Init_StopFilter();
+     Init_MappingFilter();
+     Init_StemFilter();
+
+     Init_Analyzer();
+     Init_AsciiLetterAnalyzer();
+     Init_LetterAnalyzer();
+     Init_AsciiWhiteSpaceAnalyzer();
+     Init_WhiteSpaceAnalyzer();
+     Init_AsciiStandardAnalyzer();
+     Init_StandardAnalyzer();
+     Init_PerFieldAnalyzer();
+     Init_RegExpAnalyzer();
+ }