np-ferret 0.11.6

Files changed (275)
  1. data/CHANGELOG +24 -0
  2. data/MIT-LICENSE +20 -0
  3. data/README +102 -0
  4. data/Rakefile +338 -0
  5. data/TODO +17 -0
  6. data/TUTORIAL +231 -0
  7. data/bin/ferret-browser +79 -0
  8. data/ext/Makefile +218 -0
  9. data/ext/analysis.c +1584 -0
  10. data/ext/analysis.h +219 -0
  11. data/ext/analysis.o +0 -0
  12. data/ext/api.c +69 -0
  13. data/ext/api.h +27 -0
  14. data/ext/api.o +0 -0
  15. data/ext/array.c +123 -0
  16. data/ext/array.h +53 -0
  17. data/ext/array.o +0 -0
  18. data/ext/bitvector.c +540 -0
  19. data/ext/bitvector.h +272 -0
  20. data/ext/bitvector.o +0 -0
  21. data/ext/compound_io.c +383 -0
  22. data/ext/compound_io.o +0 -0
  23. data/ext/config.h +42 -0
  24. data/ext/document.c +156 -0
  25. data/ext/document.h +53 -0
  26. data/ext/document.o +0 -0
  27. data/ext/except.c +120 -0
  28. data/ext/except.h +168 -0
  29. data/ext/except.o +0 -0
  30. data/ext/extconf.rb +14 -0
  31. data/ext/ferret.c +402 -0
  32. data/ext/ferret.h +91 -0
  33. data/ext/ferret.o +0 -0
  34. data/ext/ferret_ext.bundle +0 -0
  35. data/ext/filter.c +156 -0
  36. data/ext/filter.o +0 -0
  37. data/ext/fs_store.c +484 -0
  38. data/ext/fs_store.o +0 -0
  39. data/ext/global.c +418 -0
  40. data/ext/global.h +117 -0
  41. data/ext/global.o +0 -0
  42. data/ext/hash.c +598 -0
  43. data/ext/hash.h +475 -0
  44. data/ext/hash.o +0 -0
  45. data/ext/hashset.c +170 -0
  46. data/ext/hashset.h +187 -0
  47. data/ext/hashset.o +0 -0
  48. data/ext/header.h +58 -0
  49. data/ext/helper.c +62 -0
  50. data/ext/helper.h +13 -0
  51. data/ext/helper.o +0 -0
  52. data/ext/inc/lang.h +48 -0
  53. data/ext/inc/threading.h +31 -0
  54. data/ext/index.c +6510 -0
  55. data/ext/index.h +964 -0
  56. data/ext/index.o +0 -0
  57. data/ext/lang.h +66 -0
  58. data/ext/libstemmer.c +92 -0
  59. data/ext/libstemmer.h +79 -0
  60. data/ext/libstemmer.o +0 -0
  61. data/ext/mempool.c +87 -0
  62. data/ext/mempool.h +35 -0
  63. data/ext/mempool.o +0 -0
  64. data/ext/modules.h +162 -0
  65. data/ext/multimapper.c +310 -0
  66. data/ext/multimapper.h +51 -0
  67. data/ext/multimapper.o +0 -0
  68. data/ext/posh.c +1006 -0
  69. data/ext/posh.h +1007 -0
  70. data/ext/posh.o +0 -0
  71. data/ext/priorityqueue.c +151 -0
  72. data/ext/priorityqueue.h +143 -0
  73. data/ext/priorityqueue.o +0 -0
  74. data/ext/q_boolean.c +1608 -0
  75. data/ext/q_boolean.o +0 -0
  76. data/ext/q_const_score.c +165 -0
  77. data/ext/q_const_score.o +0 -0
  78. data/ext/q_filtered_query.c +209 -0
  79. data/ext/q_filtered_query.o +0 -0
  80. data/ext/q_fuzzy.c +335 -0
  81. data/ext/q_fuzzy.o +0 -0
  82. data/ext/q_match_all.c +148 -0
  83. data/ext/q_match_all.o +0 -0
  84. data/ext/q_multi_term.c +677 -0
  85. data/ext/q_multi_term.o +0 -0
  86. data/ext/q_parser.c +2825 -0
  87. data/ext/q_parser.o +0 -0
  88. data/ext/q_phrase.c +1126 -0
  89. data/ext/q_phrase.o +0 -0
  90. data/ext/q_prefix.c +100 -0
  91. data/ext/q_prefix.o +0 -0
  92. data/ext/q_range.c +356 -0
  93. data/ext/q_range.o +0 -0
  94. data/ext/q_span.c +2402 -0
  95. data/ext/q_span.o +0 -0
  96. data/ext/q_term.c +337 -0
  97. data/ext/q_term.o +0 -0
  98. data/ext/q_wildcard.c +171 -0
  99. data/ext/q_wildcard.o +0 -0
  100. data/ext/r_analysis.c +2636 -0
  101. data/ext/r_analysis.o +0 -0
  102. data/ext/r_index.c +3509 -0
  103. data/ext/r_index.o +0 -0
  104. data/ext/r_qparser.c +585 -0
  105. data/ext/r_qparser.o +0 -0
  106. data/ext/r_search.c +4240 -0
  107. data/ext/r_search.o +0 -0
  108. data/ext/r_store.c +513 -0
  109. data/ext/r_store.o +0 -0
  110. data/ext/r_utils.c +963 -0
  111. data/ext/r_utils.o +0 -0
  112. data/ext/ram_store.c +471 -0
  113. data/ext/ram_store.o +0 -0
  114. data/ext/search.c +1743 -0
  115. data/ext/search.h +885 -0
  116. data/ext/search.o +0 -0
  117. data/ext/similarity.c +150 -0
  118. data/ext/similarity.h +82 -0
  119. data/ext/similarity.o +0 -0
  120. data/ext/sort.c +985 -0
  121. data/ext/sort.o +0 -0
  122. data/ext/stem_ISO_8859_1_danish.c +338 -0
  123. data/ext/stem_ISO_8859_1_danish.h +16 -0
  124. data/ext/stem_ISO_8859_1_danish.o +0 -0
  125. data/ext/stem_ISO_8859_1_dutch.c +635 -0
  126. data/ext/stem_ISO_8859_1_dutch.h +16 -0
  127. data/ext/stem_ISO_8859_1_dutch.o +0 -0
  128. data/ext/stem_ISO_8859_1_english.c +1156 -0
  129. data/ext/stem_ISO_8859_1_english.h +16 -0
  130. data/ext/stem_ISO_8859_1_english.o +0 -0
  131. data/ext/stem_ISO_8859_1_finnish.c +792 -0
  132. data/ext/stem_ISO_8859_1_finnish.h +16 -0
  133. data/ext/stem_ISO_8859_1_finnish.o +0 -0
  134. data/ext/stem_ISO_8859_1_french.c +1276 -0
  135. data/ext/stem_ISO_8859_1_french.h +16 -0
  136. data/ext/stem_ISO_8859_1_french.o +0 -0
  137. data/ext/stem_ISO_8859_1_german.c +512 -0
  138. data/ext/stem_ISO_8859_1_german.h +16 -0
  139. data/ext/stem_ISO_8859_1_german.o +0 -0
  140. data/ext/stem_ISO_8859_1_italian.c +1091 -0
  141. data/ext/stem_ISO_8859_1_italian.h +16 -0
  142. data/ext/stem_ISO_8859_1_italian.o +0 -0
  143. data/ext/stem_ISO_8859_1_norwegian.c +296 -0
  144. data/ext/stem_ISO_8859_1_norwegian.h +16 -0
  145. data/ext/stem_ISO_8859_1_norwegian.o +0 -0
  146. data/ext/stem_ISO_8859_1_porter.c +776 -0
  147. data/ext/stem_ISO_8859_1_porter.h +16 -0
  148. data/ext/stem_ISO_8859_1_porter.o +0 -0
  149. data/ext/stem_ISO_8859_1_portuguese.c +1035 -0
  150. data/ext/stem_ISO_8859_1_portuguese.h +16 -0
  151. data/ext/stem_ISO_8859_1_portuguese.o +0 -0
  152. data/ext/stem_ISO_8859_1_spanish.c +1119 -0
  153. data/ext/stem_ISO_8859_1_spanish.h +16 -0
  154. data/ext/stem_ISO_8859_1_spanish.o +0 -0
  155. data/ext/stem_ISO_8859_1_swedish.c +307 -0
  156. data/ext/stem_ISO_8859_1_swedish.h +16 -0
  157. data/ext/stem_ISO_8859_1_swedish.o +0 -0
  158. data/ext/stem_KOI8_R_russian.c +701 -0
  159. data/ext/stem_KOI8_R_russian.h +16 -0
  160. data/ext/stem_KOI8_R_russian.o +0 -0
  161. data/ext/stem_UTF_8_danish.c +344 -0
  162. data/ext/stem_UTF_8_danish.h +16 -0
  163. data/ext/stem_UTF_8_danish.o +0 -0
  164. data/ext/stem_UTF_8_dutch.c +653 -0
  165. data/ext/stem_UTF_8_dutch.h +16 -0
  166. data/ext/stem_UTF_8_dutch.o +0 -0
  167. data/ext/stem_UTF_8_english.c +1176 -0
  168. data/ext/stem_UTF_8_english.h +16 -0
  169. data/ext/stem_UTF_8_english.o +0 -0
  170. data/ext/stem_UTF_8_finnish.c +808 -0
  171. data/ext/stem_UTF_8_finnish.h +16 -0
  172. data/ext/stem_UTF_8_finnish.o +0 -0
  173. data/ext/stem_UTF_8_french.c +1296 -0
  174. data/ext/stem_UTF_8_french.h +16 -0
  175. data/ext/stem_UTF_8_french.o +0 -0
  176. data/ext/stem_UTF_8_german.c +526 -0
  177. data/ext/stem_UTF_8_german.h +16 -0
  178. data/ext/stem_UTF_8_german.o +0 -0
  179. data/ext/stem_UTF_8_italian.c +1113 -0
  180. data/ext/stem_UTF_8_italian.h +16 -0
  181. data/ext/stem_UTF_8_italian.o +0 -0
  182. data/ext/stem_UTF_8_norwegian.c +302 -0
  183. data/ext/stem_UTF_8_norwegian.h +16 -0
  184. data/ext/stem_UTF_8_norwegian.o +0 -0
  185. data/ext/stem_UTF_8_porter.c +794 -0
  186. data/ext/stem_UTF_8_porter.h +16 -0
  187. data/ext/stem_UTF_8_porter.o +0 -0
  188. data/ext/stem_UTF_8_portuguese.c +1055 -0
  189. data/ext/stem_UTF_8_portuguese.h +16 -0
  190. data/ext/stem_UTF_8_portuguese.o +0 -0
  191. data/ext/stem_UTF_8_russian.c +709 -0
  192. data/ext/stem_UTF_8_russian.h +16 -0
  193. data/ext/stem_UTF_8_russian.o +0 -0
  194. data/ext/stem_UTF_8_spanish.c +1137 -0
  195. data/ext/stem_UTF_8_spanish.h +16 -0
  196. data/ext/stem_UTF_8_spanish.o +0 -0
  197. data/ext/stem_UTF_8_swedish.c +313 -0
  198. data/ext/stem_UTF_8_swedish.h +16 -0
  199. data/ext/stem_UTF_8_swedish.o +0 -0
  200. data/ext/stopwords.c +401 -0
  201. data/ext/stopwords.o +0 -0
  202. data/ext/store.c +692 -0
  203. data/ext/store.h +777 -0
  204. data/ext/store.o +0 -0
  205. data/ext/term_vectors.c +352 -0
  206. data/ext/term_vectors.o +0 -0
  207. data/ext/threading.h +31 -0
  208. data/ext/utilities.c +446 -0
  209. data/ext/utilities.o +0 -0
  210. data/ext/win32.h +54 -0
  211. data/ferret.gemspec +39 -0
  212. data/lib/ferret.rb +29 -0
  213. data/lib/ferret/browser.rb +246 -0
  214. data/lib/ferret/browser/s/global.js +192 -0
  215. data/lib/ferret/browser/s/style.css +148 -0
  216. data/lib/ferret/browser/views/document/list.rhtml +49 -0
  217. data/lib/ferret/browser/views/document/show.rhtml +27 -0
  218. data/lib/ferret/browser/views/error/index.rhtml +7 -0
  219. data/lib/ferret/browser/views/help/index.rhtml +8 -0
  220. data/lib/ferret/browser/views/home/index.rhtml +29 -0
  221. data/lib/ferret/browser/views/layout.rhtml +22 -0
  222. data/lib/ferret/browser/views/term-vector/index.rhtml +4 -0
  223. data/lib/ferret/browser/views/term/index.rhtml +199 -0
  224. data/lib/ferret/browser/views/term/termdocs.rhtml +1 -0
  225. data/lib/ferret/browser/webrick.rb +14 -0
  226. data/lib/ferret/document.rb +130 -0
  227. data/lib/ferret/field_infos.rb +44 -0
  228. data/lib/ferret/index.rb +786 -0
  229. data/lib/ferret/number_tools.rb +157 -0
  230. data/lib/ferret_ext.bundle +0 -0
  231. data/lib/ferret_version.rb +3 -0
  232. data/pkg/ferret-0.11.6.gem +0 -0
  233. data/pkg/ferret-0.11.6.tgz +0 -0
  234. data/pkg/ferret-0.11.6.zip +0 -0
  235. data/setup.rb +1555 -0
  236. data/test/test_all.rb +5 -0
  237. data/test/test_helper.rb +24 -0
  238. data/test/threading/number_to_spoken.rb +132 -0
  239. data/test/threading/thread_safety_index_test.rb +79 -0
  240. data/test/threading/thread_safety_read_write_test.rb +76 -0
  241. data/test/threading/thread_safety_test.rb +133 -0
  242. data/test/unit/analysis/tc_analyzer.rb +548 -0
  243. data/test/unit/analysis/tc_token_stream.rb +646 -0
  244. data/test/unit/index/tc_index.rb +762 -0
  245. data/test/unit/index/tc_index_reader.rb +699 -0
  246. data/test/unit/index/tc_index_writer.rb +437 -0
  247. data/test/unit/index/th_doc.rb +315 -0
  248. data/test/unit/largefile/tc_largefile.rb +46 -0
  249. data/test/unit/query_parser/tc_query_parser.rb +238 -0
  250. data/test/unit/search/tc_filter.rb +135 -0
  251. data/test/unit/search/tc_fuzzy_query.rb +147 -0
  252. data/test/unit/search/tc_index_searcher.rb +61 -0
  253. data/test/unit/search/tc_multi_searcher.rb +128 -0
  254. data/test/unit/search/tc_multiple_search_requests.rb +58 -0
  255. data/test/unit/search/tc_search_and_sort.rb +179 -0
  256. data/test/unit/search/tc_sort.rb +49 -0
  257. data/test/unit/search/tc_sort_field.rb +27 -0
  258. data/test/unit/search/tc_spans.rb +190 -0
  259. data/test/unit/search/tm_searcher.rb +384 -0
  260. data/test/unit/store/tc_fs_store.rb +77 -0
  261. data/test/unit/store/tc_ram_store.rb +35 -0
  262. data/test/unit/store/tm_store.rb +34 -0
  263. data/test/unit/store/tm_store_lock.rb +68 -0
  264. data/test/unit/tc_document.rb +81 -0
  265. data/test/unit/ts_analysis.rb +2 -0
  266. data/test/unit/ts_index.rb +2 -0
  267. data/test/unit/ts_largefile.rb +4 -0
  268. data/test/unit/ts_query_parser.rb +2 -0
  269. data/test/unit/ts_search.rb +2 -0
  270. data/test/unit/ts_store.rb +2 -0
  271. data/test/unit/ts_utils.rb +2 -0
  272. data/test/unit/utils/tc_bit_vector.rb +295 -0
  273. data/test/unit/utils/tc_number_tools.rb +117 -0
  274. data/test/unit/utils/tc_priority_queue.rb +106 -0
  275. metadata +392 -0
data/ext/q_wildcard.o ADDED
Binary file
data/ext/r_analysis.c ADDED
@@ -0,0 +1,2636 @@
1
+ #include "lang.h"
2
+ #ifdef FRT_RUBY_VERSION_1_9
3
+ # include <ruby/re.h>
4
+ #else
5
+ # include <regex.h>
6
+ #endif
7
+ #include <locale.h>
8
+ #include <ruby/st.h>
9
+ #include "ferret.h"
10
+ #include "analysis.h"
11
+
12
+ static char *frt_locale = NULL;
13
+
14
+ static VALUE mAnalysis;
15
+
16
+ static VALUE cToken;
17
+ static VALUE cAsciiLetterTokenizer;
18
+ static VALUE cLetterTokenizer;
19
+ static VALUE cAsciiWhiteSpaceTokenizer;
20
+ static VALUE cWhiteSpaceTokenizer;
21
+ static VALUE cAsciiStandardTokenizer;
22
+ static VALUE cStandardTokenizer;
23
+ static VALUE cRegExpTokenizer;
24
+
25
+ static VALUE cAsciiLowerCaseFilter;
26
+ static VALUE cLowerCaseFilter;
27
+ static VALUE cStopFilter;
28
+ static VALUE cMappingFilter;
29
+ static VALUE cHyphenFilter;
30
+ static VALUE cStemFilter;
31
+
32
+ static VALUE cAnalyzer;
33
+ static VALUE cAsciiLetterAnalyzer;
34
+ static VALUE cLetterAnalyzer;
35
+ static VALUE cAsciiWhiteSpaceAnalyzer;
36
+ static VALUE cWhiteSpaceAnalyzer;
37
+ static VALUE cAsciiStandardAnalyzer;
38
+ static VALUE cStandardAnalyzer;
39
+ static VALUE cPerFieldAnalyzer;
40
+ static VALUE cRegExpAnalyzer;
41
+
42
+ static VALUE cTokenStream;
43
+
44
+ /* TokenStream Methods */
45
+ static ID id_next;
46
+ static ID id_reset;
47
+ static ID id_clone;
48
+ static ID id_text;
49
+
50
+ /* Analyzer Methods */
51
+ static ID id_token_stream;
52
+
53
+ static VALUE object_space;
54
+
55
+ #ifndef FRT_RUBY_VERSION_1_9
56
+ extern int ruby_re_search(struct re_pattern_buffer *, const char *, int, int,
57
+ int, struct re_registers *);
58
+ #endif
59
+
60
+ int
61
+ frt_rb_hash_size(VALUE hash)
62
+ {
63
+ return RHASH(hash)->ntbl->num_entries;
64
+ }
65
+
66
+ /****************************************************************************
67
+ *
68
+ * Utility Methods
69
+ *
70
+ ****************************************************************************/
71
+
72
+ static char **
73
+ get_stopwords(VALUE rstop_words)
74
+ {
75
+ char **stop_words;
76
+ int i, len;
77
+ VALUE rstr;
78
+ Check_Type(rstop_words, T_ARRAY);
79
+ len = RARRAY_LEN(rstop_words);
80
+ stop_words = ALLOC_N(char *, RARRAY_LEN(rstop_words) + 1);
81
+ stop_words[len] = NULL;
82
+ for (i = 0; i < len; i++) {
83
+ rstr = rb_obj_as_string(RARRAY_PTR(rstop_words)[i]);
84
+ stop_words[i] = rs2s(rstr);
85
+ }
86
+ return stop_words;
87
+ }
88
+
89
+ /****************************************************************************
90
+ *
91
+ * token methods
92
+ *
93
+ ****************************************************************************/
94
+
95
+ typedef struct RToken {
96
+ VALUE text;
97
+ int start;
98
+ int end;
99
+ int pos_inc;
100
+ } RToken;
101
+
102
+ static void
103
+ frt_token_free(void *p)
104
+ {
105
+ free(p);
106
+ }
107
+
108
+ static void
109
+ frt_token_mark(void *p)
110
+ {
111
+ RToken *token = (RToken *)p;
112
+ rb_gc_mark(token->text);
113
+ }
114
+
115
+ static VALUE
116
+ frt_token_alloc(VALUE klass)
117
+ {
118
+ return Data_Wrap_Struct(klass, &frt_token_mark, &frt_token_free,
119
+ ALLOC(RToken));
120
+ }
121
+
122
+ static VALUE
123
+ get_token(Token *tk)
124
+ {
125
+ RToken *token = ALLOC(RToken);
126
+
127
+ token->text = rb_str_new2(tk->text);
128
+ token->start = tk->start;
129
+ token->end = tk->end;
130
+ token->pos_inc = tk->pos_inc;
131
+ return Data_Wrap_Struct(cToken, &frt_token_mark, &frt_token_free, token);
132
+ }
133
+
134
+ Token *
135
+ frt_set_token(Token *tk, VALUE rt)
136
+ {
137
+ RToken *rtk;
138
+
139
+ if (rt == Qnil) return NULL;
140
+
141
+ Data_Get_Struct(rt, RToken, rtk);
142
+ tk_set(tk, rs2s(rtk->text), RSTRING_LEN(rtk->text),
143
+ rtk->start, rtk->end, rtk->pos_inc);
144
+ return tk;
145
+ }
146
+
147
+ #define GET_TK(tk, self) Data_Get_Struct(self, RToken, tk)
148
+
149
+ /*
150
+ * call-seq:
151
+ * Token.new(text, start, end, pos_inc = 1) -> new Token
152
+ *
153
+ * Creates a new token setting the text, start and end offsets of the token
154
+ * and the position increment for the token.
155
+ *
156
+ * The position increment is usually set to 1 but you can set it to other
157
+ * values as needed. For example, if you have a stop word filter you will be
158
+ * skipping tokens. Let's say you have the stop words "the" and "and" and you
159
+ * parse the title "The Old Man and the Sea". The terms "Old", "Man" and
160
+ * "Sea" will have the position increments 2, 1 and 3 respectively.
161
+ *
162
+ * Another reason you might want to vary the position increment is if you are
163
+ * adding synonyms to the index. For example let's say you have the synonym
164
+ * group "quick", "fast" and "speedy". When tokenizing the phrase "Next day
165
+ * speedy delivery", you'll add "speedy" first with a position increment of 1
166
+ * and then "fast" and "quick" with position increments of 0 since they are
167
+ * represented in the same position.
168
+ *
169
+ * The offset set values +start+ and +end+ should be byte offsets, not
170
+ * character offsets. This makes it easy to use those offsets to quickly
171
+ * access the token in the input string and also to insert highlighting tags
172
+ * when necessary.
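+ *
+ * == Example
+ *
+ * A minimal sketch (the token text and byte offsets below are purely
+ * illustrative):
+ *
+ *   t   = Token.new("sea", 24, 27)        # pos_inc defaults to 1
+ *   syn = Token.new("ocean", 24, 27, 0)   # synonym stored in the same position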
173
+ *
174
+ * text:: the main text for the token.
175
+ * start:: the start offset of the token in bytes.
176
+ * end:: the end offset of the token in bytes.
177
+ * pos_inc:: the position increment of a token. See above.
178
+ * return:: a newly created and assigned Token object
179
+ */
180
+ static VALUE
181
+ frt_token_init(int argc, VALUE *argv, VALUE self)
182
+ {
183
+ RToken *token;
184
+ VALUE rtext, rstart, rend, rpos_inc, rtype;
185
+ GET_TK(token, self);
186
+ token->pos_inc = 1;
187
+ switch (rb_scan_args(argc, argv, "32", &rtext, &rstart,
188
+ &rend, &rpos_inc, &rtype)) {
189
+ case 5: /* type gets ignored at this stage */
190
+ case 4: token->pos_inc = FIX2INT(rpos_inc);
191
+ }
192
+ token->text = rb_obj_as_string(rtext);
193
+ token->start = FIX2INT(rstart);
194
+ token->end = FIX2INT(rend);
195
+ return self;
196
+ }
197
+
198
+ /*
199
+ * call-seq:
200
+ * token.cmp(other_token) -> bool
201
+ *
202
+ * Used to compare two tokens. Token is extended by Comparable so you can
203
+ * also use +<+, +>+, +<=+, +>=+ etc. to compare tokens.
204
+ *
205
+ * Tokens are sorted by the position in the text at which they occur, i.e.
206
+ * the start offset. If two tokens have the same start offset (see
207
+ * pos_inc=), they are sorted by the end offset and then
208
+ * lexically by the token text.
209
+ */
210
+ static VALUE
211
+ frt_token_cmp(VALUE self, VALUE rother)
212
+ {
213
+ RToken *token, *other;
214
+ int cmp;
215
+ GET_TK(token, self);
216
+ GET_TK(other, rother);
217
+ if (token->start > other->start) {
218
+ cmp = 1;
219
+ } else if (token->start < other->start) {
220
+ cmp = -1;
221
+ } else {
222
+ if (token->end > other->end) {
223
+ cmp = 1;
224
+ } else if (token->end < other->end) {
225
+ cmp = -1;
226
+ } else {
227
+ cmp = strcmp(rs2s(token->text), rs2s(other->text));
228
+ }
229
+ }
230
+ return INT2FIX(cmp);
231
+ }
232
+
233
+ /*
234
+ * call-seq:
235
+ * token.text -> text
236
+ *
237
+ * Returns the text that this token represents
238
+ */
239
+ static VALUE
240
+ frt_token_get_text(VALUE self)
241
+ {
242
+ RToken *token;
243
+ GET_TK(token, self);
244
+ return token->text;
245
+ }
246
+
247
+ /*
248
+ * call-seq:
249
+ * token.text = text -> text
250
+ *
251
+ * Set the text for this token.
252
+ */
253
+ static VALUE
254
+ frt_token_set_text(VALUE self, VALUE rtext)
255
+ {
256
+ RToken *token;
257
+ GET_TK(token, self);
258
+ token->text = rtext;
259
+ return rtext;
260
+ }
261
+
262
+ /*
263
+ * call-seq:
264
+ * token.start -> integer
265
+ *
266
+ * Start byte-position of this token
267
+ */
268
+ static VALUE
269
+ frt_token_get_start_offset(VALUE self)
270
+ {
271
+ RToken *token;
272
+ GET_TK(token, self);
273
+ return INT2FIX(token->start);
274
+ }
275
+
276
+ /*
277
+ * call-seq:
278
+ * token.end -> integer
279
+ *
280
+ * End byte-position of this token
281
+ */
282
+ static VALUE
283
+ frt_token_get_end_offset(VALUE self)
284
+ {
285
+ RToken *token;
286
+ GET_TK(token, self);
287
+ return INT2FIX(token->end);
288
+ }
289
+
290
+ /*
291
+ * call-seq:
292
+ * token.pos_inc -> integer
293
+ *
294
+ * Position Increment for this token
295
+ */
296
+ static VALUE
297
+ frt_token_get_pos_inc(VALUE self)
298
+ {
299
+ RToken *token;
300
+ GET_TK(token, self);
301
+ return INT2FIX(token->pos_inc);
302
+ }
303
+
304
+ /*
305
+ * call-seq:
306
+ * token.start = start -> integer
307
+ *
308
+ * Set start byte-position of this token
309
+ */
310
+ static VALUE
311
+ frt_token_set_start_offset(VALUE self, VALUE rstart)
312
+ {
313
+ RToken *token;
314
+ GET_TK(token, self);
315
+ token->start = FIX2INT(rstart);
316
+ return rstart;
317
+ }
318
+
319
+ /*
320
+ * call-seq:
321
+ * token.end = end -> integer
322
+ *
323
+ * Set end byte-position of this token
324
+ */
325
+ static VALUE
326
+ frt_token_set_end_offset(VALUE self, VALUE rend)
327
+ {
328
+ RToken *token;
329
+ GET_TK(token, self);
330
+ token->end = FIX2INT(rend);
331
+ return rend;
332
+ }
333
+
334
+ /*
335
+ * call-seq:
336
+ * token.pos_inc = pos_inc -> integer
337
+ *
338
+ * Set the position increment. This determines the position of this token
339
+ * relative to the previous Token in a TokenStream, used in phrase
340
+ * searching.
341
+ *
342
+ * The default value is 1.
343
+ *
344
+ * Some common uses for this are:
345
+ *
346
+ * * Set it to zero to put multiple terms in the same position. This is
347
+ * useful if, e.g., a word has multiple stems. Searches for phrases
348
+ * including either stem will match. In this case, all but the first
349
+ * stem's increment should be set to zero: the increment of the first
350
+ * instance should be one. Repeating a token with an increment of zero
351
+ * can also be used to boost the scores of matches on that token.
352
+ *
353
+ * * Set it to values greater than one to inhibit exact phrase matches.
354
+ * If, for example, one does not want phrases to match across removed
355
+ * stop words, then one could build a stop word filter that removes stop
356
+ * words and also sets the increment to the number of stop words removed
357
+ * before each non-stop word. Then exact phrase queries will only match
358
+ * when the terms occur with no intervening stop words.
359
+ *
360
+ */
361
+ static VALUE
362
+ frt_token_set_pos_inc(VALUE self, VALUE rpos_inc)
363
+ {
364
+ RToken *token;
365
+ GET_TK(token, self);
366
+ token->pos_inc = FIX2INT(rpos_inc);
367
+ return rpos_inc;
368
+ }
369
+
370
+ /*
371
+ * call-seq:
372
+ * token.to_s -> token_str
373
+ *
374
+ * Return a string representation of the token
375
+ */
376
+ static VALUE
377
+ frt_token_to_s(VALUE self)
378
+ {
379
+ RToken *token;
380
+ char *buf;
381
+ GET_TK(token, self);
382
+ buf = alloca(RSTRING_LEN(token->text) + 80);
383
+ sprintf(buf, "token[\"%s\":%d:%d:%d]", rs2s(token->text),
384
+ token->start, token->end, token->pos_inc);
385
+ return rb_str_new2(buf);
386
+ }
387
+
388
+ /****************************************************************************
389
+ *
390
+ * TokenStream Methods
391
+ *
392
+ ****************************************************************************/
393
+
394
+ #define GET_TS(ts, self) Data_Get_Struct(self, TokenStream, ts)
395
+
396
+ static void
397
+ frt_ts_mark(void *p)
398
+ {
399
+ TokenStream *ts = (TokenStream *)p;
400
+ if (ts->text) frt_gc_mark(&ts->text);
401
+ }
402
+
403
+ static void
404
+ frt_ts_free(TokenStream *ts)
405
+ {
406
+ if (object_get(&ts->text) != Qnil) {
407
+ object_del(&ts->text);
408
+ }
409
+ object_del(ts);
410
+ ts_deref(ts);
411
+ }
412
+
413
+ static void frt_rets_free(TokenStream *ts);
414
+ static void frt_rets_mark(TokenStream *ts);
415
+ static Token *rets_next(TokenStream *ts);
416
+
417
+ static VALUE
418
+ get_rb_token_stream(TokenStream *ts)
419
+ {
420
+ VALUE rts = object_get(ts);
421
+ if (rts == Qnil) {
422
+ if (ts->next == &rets_next) {
423
+ rts = Data_Wrap_Struct(cTokenStream, &frt_rets_mark,
424
+ &frt_rets_free, ts);
425
+ } else {
426
+ rts = Data_Wrap_Struct(cTokenStream, &frt_ts_mark,
427
+ &frt_ts_free, ts);
428
+ }
429
+ object_add(ts, rts);
430
+ }
431
+ return rts;
432
+ }
433
+
434
+ static INLINE VALUE
435
+ get_wrapped_ts(VALUE self, VALUE rstr, TokenStream *ts)
436
+ {
437
+ StringValue(rstr);
438
+ ts->reset(ts, rs2s(rstr));
439
+ Frt_Wrap_Struct(self, &frt_ts_mark, &frt_ts_free, ts);
440
+ object_add(&ts->text, rstr);
441
+ object_add(ts, self);
442
+ return self;
443
+ }
444
+
445
+ /*
446
+ * call-seq:
447
+ * token_stream.text = text -> text
448
+ *
449
+ * Set the text attribute of the TokenStream to the text you wish to be
450
+ * tokenized. For example, you may do this:
451
+ *
452
+ * token_stream.text = File.read(file_name)
453
+ */
454
+ static VALUE
455
+ frt_ts_set_text(VALUE self, VALUE rtext)
456
+ {
457
+ TokenStream *ts;
458
+ Data_Get_Struct(self, TokenStream, ts);
459
+ StringValue(rtext);
460
+ ts->reset(ts, rs2s(rtext));
461
+
462
+ /* prevent garbage collection */
463
+ rb_ivar_set(self, id_text, rtext);
464
+
465
+ return rtext;
466
+ }
467
+
468
+ /*
469
+ * call-seq:
470
+ * token_stream.text -> text
471
+ *
472
+ * Return the text that the TokenStream is tokenizing
473
+ */
474
+ static VALUE
475
+ frt_ts_get_text(VALUE self)
476
+ {
477
+ VALUE rtext = Qnil;
478
+ TokenStream *ts;
479
+ Data_Get_Struct(self, TokenStream, ts);
480
+ if ((rtext = object_get(&ts->text)) == Qnil) {
481
+ if (ts->text) {
482
+ rtext = rb_str_new2(ts->text);
483
+ object_set(&ts->text, rtext);
484
+ }
485
+ }
486
+ return rtext;
487
+ }
488
+
489
+ /*
490
+ * call-seq:
491
+ * token_stream.next -> token
492
+ *
493
+ * Return the next token from the TokenStream or nil if there are no more
494
+ * tokens.
495
+ */
496
+ static VALUE
497
+ frt_ts_next(VALUE self)
498
+ {
499
+ TokenStream *ts;
500
+ Token *next;
501
+ GET_TS(ts, self);
502
+ next = ts->next(ts);
503
+ if (next == NULL) {
504
+ return Qnil;
505
+ }
506
+
507
+ return get_token(next);
508
+ }
509
+
510
+ /*
511
+ * call-seq:
512
+ * token_stream.positions(term) -> Array of positions
513
+ *
514
+ * Get a list of what positions are occupied by the specified term in the stream,
515
+ * as well as their starting offsets within the string. Each term found will
516
+ * add two numbers to the result array -- the first is the term position, the second
517
+ * is the term offset.
518
+ *
519
+ * token_stream.positions('the') => [ 10,57, 14,73, 77,407, 92,462 ]
520
+ */
521
+ static VALUE
522
+ frt_ts_positions(VALUE self, VALUE rterm)
523
+ {
524
+ TokenStream *ts;
525
+ Token *next;
526
+ char *term = rs2s(rb_obj_as_string(rterm));
527
+ VALUE rpositions = rb_ary_new();
528
+ int position = 0;
529
+
530
+ GET_TS(ts, self);
531
+ while (NULL != (next = ts->next(ts))) {
532
+ if (next->text[0] == term[0] && strcmp(next->text, term) == 0) {
533
+ rb_ary_push(rpositions, INT2FIX(position));
534
+ rb_ary_push(rpositions, INT2FIX(next->start));
535
+ }
536
+ position += next->pos_inc;
537
+ }
538
+
539
+ return rpositions;
540
+ }
541
+
542
+ /*
543
+ * call-seq:
544
+ * token_stream.term_offsets(term) -> Array of offsets
545
+ *
546
+ * Get a list of what string offsets the specified term can be found at in the
547
+ * token stream. Returns an array of the offsets.
548
+ *
549
+ * token_stream.term_offsets('the') => [ 57, 73, 407, 462 ]
550
+ */
551
+ static VALUE
552
+ frt_ts_term_offsets(VALUE self, VALUE rterm)
553
+ {
554
+ TokenStream *ts;
555
+ Token *next;
556
+ char *term = rs2s(rb_obj_as_string(rterm));
557
+ VALUE roffsets = rb_ary_new();
558
+
559
+ GET_TS(ts, self);
560
+ while (NULL != (next = ts->next(ts))) {
561
+ if (next->text[0] == term[0] && strcmp(next->text, term) == 0) {
562
+ rb_ary_push(roffsets, INT2FIX(next->start));
563
+ }
564
+ }
565
+
566
+ return roffsets;
567
+ }
568
+
569
+ /****************************************************************************
570
+ * TokenFilter
571
+ ****************************************************************************/
572
+
573
+ #define TkFilt(filter) ((TokenFilter *)(filter))
574
+
575
+ static void
576
+ frt_tf_mark(void *p)
577
+ {
578
+ TokenStream *ts = (TokenStream *)p;
579
+ if (TkFilt(ts)->sub_ts) {
580
+ frt_gc_mark(&TkFilt(ts)->sub_ts);
581
+ }
582
+ }
583
+
584
+ static void
585
+ frt_tf_free(TokenStream *ts)
586
+ {
587
+ if (TkFilt(ts)->sub_ts && (object_get(&TkFilt(ts)->sub_ts) != Qnil)) {
588
+ object_del(&TkFilt(ts)->sub_ts);
589
+ }
590
+ object_del(ts);
591
+ ts_deref(ts);
592
+ }
593
+
594
+
595
+ /****************************************************************************
596
+ * CWrappedTokenStream
597
+ ****************************************************************************/
598
+
599
+ #define CachedTS(token_stream) ((CachedTokenStream *)(token_stream))
600
+ #define CWTS(token_stream) ((CWrappedTokenStream *)(token_stream))
601
+
602
+ typedef struct CWrappedTokenStream {
603
+ CachedTokenStream super;
604
+ VALUE rts;
605
+ } CWrappedTokenStream;
606
+
607
+ static void
608
+ cwrts_destroy_i(TokenStream *ts)
609
+ {
610
+ if (object_get(&ts->text) != Qnil) {
611
+ object_del(&ts->text);
612
+ }
613
+ rb_hash_delete(object_space, ((VALUE)ts)|1);
614
+ /*printf("rb_hash_size = %d\n", frt_rb_hash_size(object_space)); */
615
+ free(ts);
616
+ }
617
+
618
+ static Token *
619
+ cwrts_next(TokenStream *ts)
620
+ {
621
+ VALUE rtoken = rb_funcall(CWTS(ts)->rts, id_next, 0);
622
+ return frt_set_token(&(CachedTS(ts)->token), rtoken);
623
+ }
624
+
625
+ static TokenStream *
626
+ cwrts_reset(TokenStream *ts, char *text)
627
+ {
628
+ ts->t = ts->text = text;
629
+ rb_funcall(CWTS(ts)->rts, id_reset, 1, rb_str_new2(text));
630
+ return ts;
631
+ }
632
+
633
+ static TokenStream *
634
+ cwrts_clone_i(TokenStream *orig_ts)
635
+ {
636
+ TokenStream *new_ts = ts_clone_size(orig_ts, sizeof(CWrappedTokenStream));
637
+ VALUE rts = CWTS(new_ts)->rts = rb_funcall(CWTS(orig_ts)->rts, id_clone, 0);
638
+ rb_hash_aset(object_space, ((VALUE)new_ts)|1, rts);
639
+ return new_ts;
640
+ }
641
+
642
+ static TokenStream *
643
+ frt_get_cwrapped_rts(VALUE rts)
644
+ {
645
+ TokenStream *ts;
646
+ if (frt_is_cclass(rts) && DATA_PTR(rts)) {
647
+ GET_TS(ts, rts);
648
+ REF(ts);
649
+ }
650
+ else {
651
+ ts = ts_new(CWrappedTokenStream);
652
+ CWTS(ts)->rts = rts;
653
+ ts->next = &cwrts_next;
654
+ ts->reset = &cwrts_reset;
655
+ ts->clone_i = &cwrts_clone_i;
656
+ ts->destroy_i = &cwrts_destroy_i;
657
+ /* prevent from being garbage collected */
658
+ rb_hash_aset(object_space, ((VALUE)ts)|1, rts);
659
+ ts->ref_cnt = 1;
660
+ }
661
+ return ts;
662
+ }
663
+
664
+ /****************************************************************************
665
+ * RegExpTokenStream
666
+ ****************************************************************************/
667
+
668
+ #define P "[_\\/.,-]"
669
+ #define HASDIGIT "\\w*\\d\\w*"
670
+ #define ALPHA "[-_[:alpha:]]"
671
+ #define ALNUM "[-_[:alnum:]]"
672
+
673
+ #define RETS(token_stream) ((RegExpTokenStream *)(token_stream))
674
+
675
+ static const char *TOKEN_RE =
676
+ ALPHA "+(('" ALPHA "+)+|\\.(" ALPHA "\\.)+|"
677
+ "(@|\\&)\\w+([-.]\\w+)*|:\\/\\/" ALNUM "+([-.\\/]" ALNUM "+)*)?"
678
+ "|\\w+(([-._]\\w+)*\\@\\w+([-.]\\w+)+"
679
+ "|" P HASDIGIT "(" P "\\w+" P HASDIGIT ")*(" P "\\w+)?"
680
+ "|(\\.\\w+)+"
681
+ "|"
682
+ ")";
683
+ static VALUE rtoken_re;
684
+
685
+ typedef struct RegExpTokenStream {
686
+ CachedTokenStream super;
687
+ VALUE rtext;
688
+ VALUE regex;
689
+ VALUE proc;
690
+ long curr_ind;
691
+ } RegExpTokenStream;
692
+
693
+ static void
694
+ rets_destroy_i(TokenStream *ts)
695
+ {
696
+ if (object_get(&ts->text) != Qnil) {
697
+ object_del(&ts->text);
698
+ }
699
+ rb_hash_delete(object_space, ((VALUE)ts)|1);
700
+ /*printf("rb_hash_size = %d\n", frt_rb_hash_size(object_space)); */
701
+ free(ts);
702
+ }
703
+
704
+ static void
705
+ frt_rets_free(TokenStream *ts)
706
+ {
707
+ if (object_get(&ts->text) != Qnil) {
708
+ object_del(&ts->text);
709
+ }
710
+ object_del(ts);
711
+ ts_deref(ts);
712
+ }
713
+
714
+ static void
715
+ frt_rets_mark(TokenStream *ts)
716
+ {
717
+ if (ts->text) frt_gc_mark(&ts->text);
718
+ rb_gc_mark(RETS(ts)->rtext);
719
+ rb_gc_mark(RETS(ts)->regex);
720
+ rb_gc_mark(RETS(ts)->proc);
721
+ }
722
+
723
+ /*
724
+ * call-seq:
725
+ * tokenizer.text = text -> text
726
+ *
727
+ * Set the text to be tokenized by the tokenizer. The tokenizer gets reset to
728
+ * tokenize the text from the beginning.
729
+ */
730
+ static VALUE
731
+ frt_rets_set_text(VALUE self, VALUE rtext)
732
+ {
733
+ TokenStream *ts;
734
+ GET_TS(ts, self);
735
+
736
+ rb_hash_aset(object_space, ((VALUE)ts)|1, rtext);
737
+ StringValue(rtext);
738
+ RETS(ts)->rtext = rtext;
739
+ RETS(ts)->curr_ind = 0;
740
+
741
+ return rtext;
742
+ }
743
+
744
+ /*
745
+ * call-seq:
746
+ * tokenizer.text -> text
747
+ *
748
+ * Get the text being tokenized by the tokenizer.
749
+ */
750
+ static VALUE
751
+ frt_rets_get_text(VALUE self)
752
+ {
753
+ TokenStream *ts;
754
+ GET_TS(ts, self);
755
+ return RETS(ts)->rtext;
756
+ }
757
+
758
+ #ifdef FRT_RUBY_VERSION_1_9
759
+
760
+ // partly lifted from ruby 1.9 string.c
761
+ #include <ruby/encoding.h>
762
+ #define BEG(no) regs->beg[no]
763
+ #define END(no) regs->end[no]
764
+ #define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
765
+ static VALUE
766
+ scan_once(VALUE str, VALUE pat, long *start)
767
+ {
768
+ VALUE match;
769
+ struct re_registers *regs;
770
+
771
+ if (rb_reg_search(pat, str, *start, 0) >= 0) {
772
+ match = rb_backref_get();
773
+ regs = RMATCH_REGS(match);
774
+ if (BEG(0) == END(0)) {
775
+ rb_encoding *enc = STR_ENC_GET(str);
776
+ /*
777
+ * Always consume at least one character of the input string
778
+ */
779
+ if (RSTRING_LEN(str) > END(0))
780
+ *start = END(0)+rb_enc_mbclen(RSTRING_PTR(str)+END(0),
781
+ RSTRING_END(str), enc);
782
+ else
783
+ *start = END(0)+1;
784
+ }
785
+ else {
786
+ *start = END(0);
787
+ }
788
+ return rb_reg_nth_match(0, match);
789
+ }
790
+ return Qnil;
791
+ }
792
+ //
793
+
794
+ static Token *
795
+ rets_next(TokenStream *ts)
796
+ {
797
+ VALUE ret;
798
+ long rtok_len;
799
+ int beg, end;
800
+ Check_Type(RETS(ts)->regex, T_REGEXP);
801
+ ret = scan_once(RETS(ts)->rtext, RETS(ts)->regex, &(RETS(ts)->curr_ind));
802
+ if (NIL_P(ret)) return NULL;
803
+
804
+ Check_Type(ret, T_STRING);
805
+ rtok_len = RSTRING_LEN(ret);
806
+ beg = RETS(ts)->curr_ind - rtok_len;
807
+ end = RETS(ts)->curr_ind;
808
+
809
+ if (NIL_P(RETS(ts)->proc)) {
810
+ return tk_set(&(CachedTS(ts)->token), rs2s(ret), rtok_len,
811
+ beg, end, 1);
812
+ } else {
813
+ VALUE rtok;
814
+ rtok = rb_funcall(RETS(ts)->proc, id_call, 1, ret);
815
+ return tk_set(&(CachedTS(ts)->token), rs2s(rtok),
816
+ RSTRING_LEN(rtok), beg, end, 1);
817
+ }
818
+ }
819
+
820
+ #else
821
+
822
+ static Token *
823
+ rets_next(TokenStream *ts)
824
+ {
825
+ static struct re_registers regs;
826
+ int ret, beg, end;
827
+ struct RString *rtext = RSTRING(RETS(ts)->rtext);
828
+ long rtext_len = RSTRING_LEN(RETS(ts)->rtext);
829
+ char *rtext_ptr = RSTRING_PTR(RETS(ts)->rtext);
830
+ Check_Type(RETS(ts)->regex, T_REGEXP);
831
+ ret = ruby_re_search(RREGEXP(RETS(ts)->regex)->ptr,
832
+ rtext_ptr, rtext_len,
833
+ RETS(ts)->curr_ind, rtext_len - RETS(ts)->curr_ind,
834
+ &regs);
835
+
836
+ if (ret == -2) rb_raise(rb_eStandardError, "regexp buffer overflow");
837
+ if (ret < 0) return NULL; /* not matched */
838
+
839
+ beg = regs.beg[0];
840
+ RETS(ts)->curr_ind = end = regs.end[0];
841
+ if (NIL_P(RETS(ts)->proc)) {
842
+ return tk_set(&(CachedTS(ts)->token), rtext_ptr + beg, end - beg,
843
+ beg, end, 1);
844
+ } else {
845
+ VALUE rtok = rb_str_new(rtext_ptr + beg, end - beg);
846
+ rtok = rb_funcall(RETS(ts)->proc, id_call, 1, rtok);
847
+ return tk_set(&(CachedTS(ts)->token), rs2s(rtok),
848
+ RSTRING_LEN(rtok), beg, end, 1);
849
+ }
850
+ }
851
+
852
+ #endif
853
+
854
+
855
+ static TokenStream *
856
+ rets_reset(TokenStream *ts, char *text)
857
+ {
858
+ RETS(ts)->rtext = rb_str_new2(text);
859
+ RETS(ts)->curr_ind = 0;
860
+ return ts;
861
+ }
862
+
863
+ static TokenStream *
864
+ rets_clone_i(TokenStream *orig_ts)
865
+ {
866
+ TokenStream *ts = ts_clone_size(orig_ts, sizeof(RegExpTokenStream));
867
+ return ts;
868
+ }
869
+
870
+ static TokenStream *
871
+ rets_new(VALUE rtext, VALUE regex, VALUE proc)
872
+ {
873
+ TokenStream *ts = ts_new(RegExpTokenStream);
874
+
875
+ if (rtext != Qnil) {
876
+ rtext = StringValue(rtext);
877
+ rb_hash_aset(object_space, ((VALUE)ts)|1, rtext);
878
+ }
879
+ ts->reset = &rets_reset;
880
+ ts->next = &rets_next;
881
+ ts->clone_i = &rets_clone_i;
882
+ ts->destroy_i = &rets_destroy_i;
883
+
884
+ RETS(ts)->curr_ind = 0;
885
+ RETS(ts)->rtext = rtext;
886
+ RETS(ts)->proc = proc;
887
+
888
+ if (NIL_P(regex)) {
889
+ RETS(ts)->regex = rtoken_re;
890
+ } else {
891
+ Check_Type(regex, T_REGEXP);
892
+ RETS(ts)->regex = regex;
893
+ }
894
+
895
+ return ts;
896
+ }
897
+
898
+ /*
899
+ * call-seq:
900
+ * RegExpTokenizer.new(input, /[[:alpha:]]+/)
901
+ *
902
+ * Create a new tokenizer based on a regular expression
903
+ *
904
+ * input:: text to tokenize
905
+ * regexp:: regular expression used to recognize tokens in the input
906
+ */
907
+ static VALUE
908
+ frt_rets_init(int argc, VALUE *argv, VALUE self)
909
+ {
910
+ VALUE rtext, regex, proc;
911
+ TokenStream *ts;
912
+
913
+ rb_scan_args(argc, argv, "11&", &rtext, &regex, &proc);
914
+
915
+ ts = rets_new(rtext, regex, proc);
916
+
917
+ Frt_Wrap_Struct(self, &frt_rets_mark, &frt_rets_free, ts);
918
+ object_add(ts, self);
919
+ return self;
920
+ }
921
+
922
+ /****************************************************************************
923
+ * Tokenizers
924
+ ****************************************************************************/
925
+
926
+ #define TS_ARGS(dflt) \
927
+ bool lower;\
928
+ VALUE rlower, rstr;\
929
+ rb_scan_args(argc, argv, "11", &rstr, &rlower);\
930
+ lower = (argc ? RTEST(rlower) : dflt)
931
+
932
+ /*
933
+ * call-seq:
934
+ * AsciiLetterTokenizer.new() -> tokenizer
935
+ *
936
+ * Create a new AsciiLetterTokenizer
937
+ */
938
+ static VALUE
939
+ frt_a_letter_tokenizer_init(VALUE self, VALUE rstr)
940
+ {
941
+ return get_wrapped_ts(self, rstr, letter_tokenizer_new());
942
+ }
943
+
944
+ /*
945
+ * call-seq:
946
+ * LetterTokenizer.new(lower = true) -> tokenizer
947
+ *
948
+ * Create a new LetterTokenizer which optionally downcases tokens. Downcasing
949
+ * is done according to the current locale.
950
+ *
951
+ * lower:: set to false if you don't wish to downcase tokens
952
+ */
953
+ static VALUE
954
+ frt_letter_tokenizer_init(int argc, VALUE *argv, VALUE self)
955
+ {
956
+ TS_ARGS(false);
957
+ #ifndef POSH_OS_WIN32
958
+ if (!frt_locale) frt_locale = setlocale(LC_CTYPE, "");
959
+ #endif
960
+ return get_wrapped_ts(self, rstr, mb_letter_tokenizer_new(lower));
961
+ }
962
+
963
+ /*
964
+ * call-seq:
965
+ * AsciiWhiteSpaceTokenizer.new() -> tokenizer
966
+ *
967
+ * Create a new AsciiWhiteSpaceTokenizer
968
+ */
969
+ static VALUE
970
+ frt_a_whitespace_tokenizer_init(VALUE self, VALUE rstr)
971
+ {
972
+ return get_wrapped_ts(self, rstr, whitespace_tokenizer_new());
973
+ }
974
+
975
+ /*
976
+ * call-seq:
977
+ * WhiteSpaceTokenizer.new(lower = true) -> tokenizer
978
+ *
979
+ * Create a new WhiteSpaceTokenizer which optionally downcases tokens.
980
+ * Downcasing is done according to the current locale.
981
+ *
982
+ * lower:: set to false if you don't wish to downcase tokens
983
+ */
984
+ static VALUE
985
+ frt_whitespace_tokenizer_init(int argc, VALUE *argv, VALUE self)
986
+ {
987
+ TS_ARGS(false);
988
+ #ifndef POSH_OS_WIN32
989
+ if (!frt_locale) frt_locale = setlocale(LC_CTYPE, "");
990
+ #endif
991
+ return get_wrapped_ts(self, rstr, mb_whitespace_tokenizer_new(lower));
992
+ }
993
+
994
+ /*
995
+ * call-seq:
996
+ * AsciiStandardTokenizer.new() -> tokenizer
997
+ *
998
+ * Create a new AsciiStandardTokenizer
999
+ */
1000
+ static VALUE
1001
+ frt_a_standard_tokenizer_init(VALUE self, VALUE rstr)
1002
+ {
1003
+ return get_wrapped_ts(self, rstr, standard_tokenizer_new());
1004
+ }
1005
+
1006
+ /*
1007
+ * call-seq:
1008
+ * StandardTokenizer.new(lower = true) -> tokenizer
1009
+ *
1010
+ * Create a new StandardTokenizer which optionally downcases tokens.
1011
+ * Downcasing is done according to the current locale.
1012
+ *
1013
+ * lower:: set to false if you don't wish to downcase tokens
1014
+ */
1015
+ static VALUE
1016
+ frt_standard_tokenizer_init(VALUE self, VALUE rstr)
1017
+ {
1018
+ #ifndef POSH_OS_WIN32
1019
+ if (!frt_locale) frt_locale = setlocale(LC_CTYPE, "");
1020
+ #endif
1021
+ return get_wrapped_ts(self, rstr, mb_standard_tokenizer_new());
1022
+ }
1023
+
1024
+ /****************************************************************************
1025
+ * Filters
1026
+ ****************************************************************************/
1027
+
1028
+
1029
+ /*
1030
+ * call-seq:
1031
+ * AsciiLowerCaseFilter.new(token_stream) -> token_stream
1032
+ *
1033
+ * Create an AsciiLowerCaseFilter which normalizes a token's text to
1034
+ * lowercase but only for ASCII characters. For other characters use
1035
+ * LowerCaseFilter.
1036
+ */
1037
+ static VALUE
1038
+ frt_a_lowercase_filter_init(VALUE self, VALUE rsub_ts)
1039
+ {
1040
+ TokenStream *ts = frt_get_cwrapped_rts(rsub_ts);
1041
+ ts = lowercase_filter_new(ts);
1042
+ object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
1043
+
1044
+ Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
1045
+ object_add(ts, self);
1046
+ return self;
1047
+ }
1048
+
1049
+ /*
1050
+ * call-seq:
1051
+ * LowerCaseFilter.new(token_stream) -> token_stream
1052
+ *
1053
+ * Create a LowerCaseFilter which normalizes a token's text to
1054
+ * lowercase based on the current locale.
1055
+ */
1056
+ static VALUE
1057
+ frt_lowercase_filter_init(VALUE self, VALUE rsub_ts)
1058
+ {
1059
+ TokenStream *ts = frt_get_cwrapped_rts(rsub_ts);
1060
+ #ifndef POSH_OS_WIN32
1061
+ if (!frt_locale) frt_locale = setlocale(LC_CTYPE, "");
1062
+ #endif
1063
+ ts = mb_lowercase_filter_new(ts);
1064
+ object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
1065
+
1066
+ Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
1067
+ object_add(ts, self);
1068
+ return self;
1069
+ }
1070
+
1071
+ /*
1072
+ * call-seq:
1073
+ * HyphenFilter.new(token_stream) -> token_stream
1074
+ *
1075
+ * Create a HyphenFilter which filters hyphenated words. The way it works is
1076
+ * by adding the word both concatenated into a single word and split into
1077
+ * multiple words, i.e. "e-mail" becomes "email" and "e mail". This way a
1078
+ * search for "e-mail", "email" and "mail" will all match. This filter is
1079
+ * used by default by the StandardAnalyzer.
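+ *
+ * == Example
+ *
+ * A minimal sketch (the input text is illustrative):
+ *
+ *   ts = HyphenFilter.new(AsciiWhiteSpaceTokenizer.new("high-speed e-mail"))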
1080
+ */
1081
+ static VALUE
1082
+ frt_hyphen_filter_init(VALUE self, VALUE rsub_ts)
1083
+ {
1084
+ TokenStream *ts = frt_get_cwrapped_rts(rsub_ts);
1085
+ ts = hyphen_filter_new(ts);
1086
+ object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
1087
+
1088
+ Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
1089
+ object_add(ts, self);
1090
+ return self;
1091
+ }
1092
+
1093
+ /*
1094
+ * call-seq:
1095
+ * StopFilter.new(token_stream) -> token_stream
1096
+ * StopFilter.new(token_stream, ["the", "and", "it"]) -> token_stream
1097
+ *
1098
+ * Create a StopFilter which removes *stop-words* from a TokenStream. You can
1099
+ * optionally specify the stopwords you wish to have removed.
1100
+ *
1101
+ * token_stream:: TokenStream to be filtered
1102
+ * stop_words:: Array of *stop-words* you wish to be filtered out. This
1103
+ * defaults to a list of English stop-words. The
1104
+ * Ferret::Analysis module contains a number of stop-word lists.
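+ *
+ * == Example
+ *
+ * A minimal sketch (the input text is illustrative):
+ *
+ *   ts = StandardTokenizer.new("the quick brown fox and the dog")
+ *   ts = StopFilter.new(ts, ["the", "and"])
+ *   while token = ts.next
+ *     puts token.text            # => quick, brown, fox, dog
+ *   end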
1105
+ */
1106
+ static VALUE
1107
+ frt_stop_filter_init(int argc, VALUE *argv, VALUE self)
1108
+ {
1109
+ VALUE rsub_ts, rstop_words;
1110
+ TokenStream *ts;
1111
+ rb_scan_args(argc, argv, "11", &rsub_ts, &rstop_words);
1112
+ ts = frt_get_cwrapped_rts(rsub_ts);
1113
+ if (rstop_words != Qnil) {
1114
+ char **stop_words = get_stopwords(rstop_words);
1115
+ ts = stop_filter_new_with_words(ts, (const char **)stop_words);
1116
+
1117
+ free(stop_words);
1118
+ } else {
1119
+ ts = stop_filter_new(ts);
1120
+ }
1121
+ object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
1122
+
1123
+ Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
1124
+ object_add(ts, self);
1125
+ return self;
1126
+ }
1127
+
1128
+ static INLINE void frt_add_mapping_i(TokenStream *mf, VALUE from, char *to)
1129
+ {
1130
+ switch (TYPE(from)) {
1131
+ case T_STRING:
1132
+ mapping_filter_add(mf, rs2s(from), to);
1133
+ break;
1134
+ case T_SYMBOL:
1135
+ mapping_filter_add(mf, rb_id2name(SYM2ID(from)), to);
1136
+ break;
1137
+ default:
1138
+ rb_raise(rb_eArgError,
1139
+ "cannot map from %s with MappingFilter",
1140
+ rs2s(rb_obj_as_string(from)));
1141
+ break;
1142
+ }
1143
+ }
1144
+
1145
+ static int frt_add_mappings_i(VALUE key, VALUE value, VALUE arg)
1146
+ {
1147
+ if (key == Qundef) {
1148
+ return ST_CONTINUE;
1149
+ } else {
1150
+ TokenStream *mf = (TokenStream *)arg;
1151
+ char *to;
1152
+ switch (TYPE(value)) {
1153
+ case T_STRING:
1154
+ to = rs2s(value);
1155
+ break;
1156
+ case T_SYMBOL:
1157
+ to = rb_id2name(SYM2ID(value));
1158
+ break;
1159
+ default:
1160
+ rb_raise(rb_eArgError,
1161
+ "cannot map to %s with MappingFilter",
1162
+ rs2s(rb_obj_as_string(key)));
1163
+ break;
1164
+ }
1165
+ if (TYPE(key) == T_ARRAY) {
1166
+ int i;
1167
+ for (i = RARRAY_LEN(key) - 1; i >= 0; i--) {
1168
+ frt_add_mapping_i(mf, RARRAY_PTR(key)[i], to);
1169
+ }
1170
+ }
1171
+ else {
1172
+ frt_add_mapping_i(mf, key, to);
1173
+ }
1174
+ }
1175
+ return ST_CONTINUE;
1176
+ }
1177
+
1178
+
1179
+ /*
1180
+ * call-seq:
1181
+ * MappingFilter.new(token_stream, mapping) -> token_stream
1182
+ *
1183
+ * Create a MappingFilter which maps strings in tokens. This is usually used
1184
+ * to map UTF-8 characters to ASCII characters for easier searching and
1185
+ * better search recall. The mapping is compiled into a Deterministic Finite
1186
+ * Automaton so it is super fast. This Filter can therefore be used for
1187
+ * indexing very large datasets. Currently regular expressions are not
1188
+ * supported. If you are really interested in the feature, please contact me
1189
+ * at dbalmain@gmail.com.
1190
+ *
1191
+ * token_stream:: TokenStream to be filtered
1192
+ * mapping:: Hash of mappings to apply to tokens. The key can be a
1193
+ * String or an Array of Strings. The value must be a String
1194
+ *
1195
+ * == Example
1196
+ *
1197
+ * filt = MappingFilter.new(token_stream,
1198
+ * {
1199
+ * ['à','á','â','ã','ä','å'] => 'a',
1200
+ * ['è','é','ê','ë','ē','ę'] => 'e'
1201
+ * })
1202
+ */
1203
+ static VALUE
1204
+ frt_mapping_filter_init(VALUE self, VALUE rsub_ts, VALUE mapping)
1205
+ {
1206
+ TokenStream *ts;
1207
+ ts = frt_get_cwrapped_rts(rsub_ts);
1208
+ ts = mapping_filter_new(ts);
1209
+ rb_hash_foreach(mapping, frt_add_mappings_i, (VALUE)ts);
1210
+ mulmap_compile(((MappingFilter *)ts)->mapper);
1211
+ object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
1212
+
1213
+ Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
1214
+ object_add(ts, self);
1215
+ return self;
1216
+ }
1217
+
1218
+ /*
1219
+ * call-seq:
1220
+ * StemFilter.new(token_stream) -> token_stream
1221
+ * StemFilter.new(token_stream,
1222
+ * algorithm="english",
1223
+ * encoding="UTF-8") -> token_stream
1224
+ *
1225
+ * Create a StemFilter which uses a Snowball stemmer (thank you Martin
1226
+ * Porter) to stem words. You can optionally specify the algorithm (default:
1227
+ * "english") and encoding (default: "UTF-8").
1228
+ *
1229
+ * token_stream:: TokenStream to be filtered
1230
+ * algorithm:: The algorithm (or language) to use
1231
+ * encoding:: The encoding of the data (default: "UTF-8")
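+ *
+ * == Example
+ *
+ * A minimal sketch (+ts+ is assumed to be any existing TokenStream):
+ *
+ *   stemmed = StemFilter.new(ts, "english", "UTF-8")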
1232
+ */
1233
+ static VALUE
1234
+ frt_stem_filter_init(int argc, VALUE *argv, VALUE self)
1235
+ {
1236
+ VALUE rsub_ts, ralgorithm, rcharenc;
1237
+ char *algorithm = "english";
1238
+ char *charenc = NULL;
1239
+ TokenStream *ts;
1240
+ rb_scan_args(argc, argv, "12", &rsub_ts, &ralgorithm, &rcharenc);
1241
+ ts = frt_get_cwrapped_rts(rsub_ts);
1242
+ switch (argc) {
1243
+ case 3: charenc = rs2s(rb_obj_as_string(rcharenc));
1244
+ case 2: algorithm = rs2s(rb_obj_as_string(ralgorithm));
1245
+ }
1246
+ ts = stem_filter_new(ts, algorithm, charenc);
1247
+ object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
1248
+
1249
+ Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
1250
+ object_add(ts, self);
1251
+ return self;
1252
+ }
1253
+
1254
+ /****************************************************************************
1255
+ *
1256
+ * Analyzer Methods
1257
+ *
1258
+ ****************************************************************************/
1259
+
1260
+ /****************************************************************************
1261
+ * CWrappedAnalyzer Methods
1262
+ ****************************************************************************/
1263
+
1264
+ #define GET_A(a, self) Data_Get_Struct(self, Analyzer, a)
1265
+
1266
+ #define CWA(analyzer) ((CWrappedAnalyzer *)(analyzer))
1267
+ typedef struct CWrappedAnalyzer
1268
+ {
1269
+ Analyzer super;
1270
+ VALUE ranalyzer;
1271
+ } CWrappedAnalyzer;
1272
+
1273
+ static void
1274
+ cwa_destroy_i(Analyzer *a)
1275
+ {
1276
+ rb_hash_delete(object_space, ((VALUE)a)|1);
1277
+ /*printf("rb_hash_size = %d\n", frt_rb_hash_size(object_space)); */
1278
+ free(a);
1279
+ }
1280
+
1281
+ static TokenStream *
1282
+ cwa_get_ts(Analyzer *a, char *field, char *text)
1283
+ {
1284
+ VALUE rts = rb_funcall(CWA(a)->ranalyzer, id_token_stream, 2,
1285
+ ID2SYM(rb_intern(field)), rb_str_new2(text));
1286
+ return frt_get_cwrapped_rts(rts);
1287
+ }
1288
+
1289
+ Analyzer *
1290
+ frt_get_cwrapped_analyzer(VALUE ranalyzer)
1291
+ {
1292
+ Analyzer *a = NULL;
1293
+ if (frt_is_cclass(ranalyzer) && DATA_PTR(ranalyzer)) {
1294
+ Data_Get_Struct(ranalyzer, Analyzer, a);
1295
+ REF(a);
1296
+ }
1297
+ else {
1298
+ a = (Analyzer *)ecalloc(sizeof(CWrappedAnalyzer));
1299
+ a->destroy_i = &cwa_destroy_i;
1300
+ a->get_ts = &cwa_get_ts;
1301
+ a->ref_cnt = 1;
1302
+ ((CWrappedAnalyzer *)a)->ranalyzer = ranalyzer;
1303
+ /* prevent from being garbage collected */
1304
+ rb_hash_aset(object_space, ((VALUE)a)|1, ranalyzer);
1305
+ }
1306
+ return a;
1307
+ }
1308
+
1309
+ static void
1310
+ frt_analyzer_free(Analyzer *a)
1311
+ {
1312
+ object_del(a);
1313
+ a_deref(a);
1314
+ }
1315
+
1316
+ VALUE
1317
+ frt_get_analyzer(Analyzer *a)
1318
+ {
1319
+ VALUE self = Qnil;
1320
+ if (a) {
1321
+ self = object_get(a);
1322
+ if (self == Qnil) {
1323
+ self = Data_Wrap_Struct(cAnalyzer, NULL, &frt_analyzer_free, a);
1324
+ REF(a);
1325
+ object_add(a, self);
1326
+ }
1327
+ }
1328
+ return self;
1329
+ }
1330
+
1331
+ INLINE VALUE
1332
+ get_rb_ts_from_a(Analyzer *a, VALUE rfield, VALUE rstring)
1333
+ {
1334
+ TokenStream *ts = a_get_ts(a, frt_field(rfield), rs2s(rstring));
1335
+
1336
+ /* Make sure that there is no entry already */
1337
+ object_set(&ts->text, rstring);
1338
+ return get_rb_token_stream(ts);
1339
+ }
1340
+
1341
+ /*
1342
+ * call-seq:
1343
+ * analyzer.token_stream(field_name, input) -> token_stream
1344
+ *
1345
+ * Create a new TokenStream to tokenize +input+. The TokenStream created may
1346
+ * also depend on the +field_name+, although this parameter is typically
1347
+ * ignored.
1348
+ *
1349
+ * field_name:: name of the field to be tokenized
1350
+ * input:: data from the field to be tokenized
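+ *
+ * == Example
+ *
+ * A minimal sketch (the field name and input are illustrative):
+ *
+ *   ts = analyzer.token_stream(:title, "A Portrait of the Artist")
+ *   while token = ts.next
+ *     puts token.text
+ *   end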
1351
+ */
1352
+ static VALUE
1353
+ frt_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
1354
+ {
1355
+ /* NOTE: Any changes made to this method may also need to be applied to
1356
+ * frt_re_analyzer_token_stream */
1357
+ Analyzer *a;
1358
+ GET_A(a, self);
1359
+
1360
+ StringValue(rstring);
1361
+
1362
+ return get_rb_ts_from_a(a, rfield, rstring);
1363
+ }
1364
+
1365
+ #define GET_LOWER(dflt) \
1366
+ bool lower;\
1367
+ VALUE rlower;\
1368
+ rb_scan_args(argc, argv, "01", &rlower);\
1369
+ lower = (argc ? RTEST(rlower) : dflt)
1370
+
1371
+ /*
1372
+ * call-seq:
1373
+ * AsciiWhiteSpaceAnalyzer.new(lower = false) -> analyzer
1374
+ *
1375
+ * Create a new AsciiWhiteSpaceAnalyzer which leaves token case as is by
1376
+ * default but can optionally downcase tokens. Lowercasing will only be done to
1377
+ * ASCII characters.
1378
+ *
1379
+ * lower:: set to false if you don't want the field's tokens to be downcased
1380
+ */
1381
+ static VALUE
1382
+ frt_a_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
1383
+ {
1384
+ Analyzer *a;
1385
+ GET_LOWER(false);
1386
+ a = whitespace_analyzer_new(lower);
1387
+ Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
1388
+ object_add(a, self);
1389
+ return self;
1390
+ }
1391
+
1392
+ /*
1393
+ * call-seq:
1394
+ * WhiteSpaceAnalyzer.new(lower = false) -> analyzer
1395
+ *
1396
+ * Create a new WhiteSpaceAnalyzer which leaves token case as is by default
1397
+ * but can optionally downcase tokens. Lowercasing will be done based on the current
1398
+ * locale.
1399
+ *
1400
+ * lower:: set to false if you don't want the field's tokens to be downcased
1401
+ */
1402
+ static VALUE
1403
+ frt_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
1404
+ {
1405
+ Analyzer *a;
1406
+ GET_LOWER(false);
1407
+ #ifndef POSH_OS_WIN32
1408
+ if (!frt_locale) frt_locale = setlocale(LC_CTYPE, "");
1409
+ #endif
1410
+ a = mb_whitespace_analyzer_new(lower);
1411
+ Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
1412
+ object_add(a, self);
1413
+ return self;
1414
+ }
1415
+
1416
+ /*
1417
+ * call-seq:
1418
+ * AsciiLetterAnalyzer.new(lower = true) -> analyzer
1419
+ *
1420
+ * Create a new AsciiLetterAnalyzer which downcases tokens by default
1421
+ * but can optionally leave case as is. Lowercasing will only be done to
1422
+ * ASCII characters.
1423
+ *
1424
+ * lower:: set to false if you don't want the field's tokens to be downcased
1425
+ */
1426
+ static VALUE
1427
+ frt_a_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
1428
+ {
1429
+ Analyzer *a;
1430
+ GET_LOWER(true);
1431
+ a = letter_analyzer_new(lower);
1432
+ Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
1433
+ object_add(a, self);
1434
+ return self;
1435
+ }
1436
+
1437
+ /*
1438
+ * call-seq:
1439
+ * LetterAnalyzer.new(lower = true) -> analyzer
1440
+ *
1441
+ * Create a new LetterAnalyzer which downcases tokens by default but can
1442
+ * optionally leave case as is. Lowercasing will be done based on the current
1443
+ * locale.
1444
+ *
1445
+ * lower:: set to false if you don't want the field's tokens to be downcased
1446
+ */
1447
+ static VALUE
1448
+ frt_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
1449
+ {
1450
+ Analyzer *a;
1451
+ GET_LOWER(true);
1452
+ #ifndef POSH_OS_WIN32
1453
+ if (!frt_locale) frt_locale = setlocale(LC_CTYPE, "");
1454
+ #endif
1455
+ a = mb_letter_analyzer_new(lower);
1456
+ Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
1457
+ object_add(a, self);
1458
+ return self;
1459
+ }
1460
+
1461
+ static VALUE
1462
+ get_rstopwords(const char **stop_words)
1463
+ {
1464
+ char **w = (char **)stop_words;
1465
+ VALUE rstopwords = rb_ary_new();
1466
+
1467
+ while (*w) {
1468
+ rb_ary_push(rstopwords, rb_str_new2(*w));
1469
+ w++;
1470
+ }
1471
+ return rstopwords;
1472
+ }
1473
+
1474
+ /*
1475
+ * call-seq:
1476
+ * AsciiStandardAnalyzer.new(stop_words = FULL_ENGLISH_STOP_WORDS, lower = true)
1477
+ * -> analyzer
1478
+ *
1479
+ * Create a new AsciiStandardAnalyzer which downcases tokens by default but
1480
+ * can optionally leave case as is. Lowercasing will only be done to ASCII
1481
+ * characters. You can also set the list of stop-words to be used by the
1482
+ * StopFilter.
1483
+ *
1484
+ * lower:: set to false if you don't want the field's tokens to be downcased
1485
+ * stop_words:: list of stop-words to pass to the StopFilter
1486
+ */
1487
+ static VALUE
1488
+ frt_a_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
1489
+ {
1490
+ bool lower;
1491
+ VALUE rlower, rstop_words;
1492
+ Analyzer *a;
1493
+ rb_scan_args(argc, argv, "02", &rstop_words, &rlower);
1494
+ lower = ((rlower == Qnil) ? true : RTEST(rlower));
1495
+ if (rstop_words != Qnil) {
1496
+ char **stop_words = get_stopwords(rstop_words);
1497
+ a = standard_analyzer_new_with_words((const char **)stop_words, lower);
1498
+ free(stop_words);
1499
+ } else {
1500
+ a = standard_analyzer_new(lower);
1501
+ }
1502
+ Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
1503
+ object_add(a, self);
1504
+ return self;
1505
+ }
1506
+
1507
+ /*
1508
+ * call-seq:
1509
+ * StandardAnalyzer.new(stop_words = FULL_ENGLISH_STOP_WORDS, lower=true)
1510
+ * -> analyzer
1511
+ *
1512
+ * Create a new StandardAnalyzer which downcases tokens by default but can
1513
+ * optionally leave case as is. Lowercasing will be done based on the current
1514
+ * locale. You can also set the list of stop-words to be used by the
1515
+ * StopFilter.
1516
+ *
1517
+ * lower:: set to false if you don't want the field's tokens to be downcased
1518
+ * stop_words:: list of stop-words to pass to the StopFilter
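+ *
+ * == Example
+ *
+ * A minimal sketch (the stop-word list is illustrative):
+ *
+ *   analyzer = StandardAnalyzer.new(["and", "or", "the"], false)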
1519
+ */
1520
+ static VALUE
1521
+ frt_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
1522
+ {
1523
+ bool lower;
1524
+ VALUE rlower, rstop_words;
1525
+ Analyzer *a;
1526
+ #ifndef POSH_OS_WIN32
1527
+ if (!frt_locale) frt_locale = setlocale(LC_CTYPE, "");
1528
+ #endif
1529
+ rb_scan_args(argc, argv, "02", &rstop_words, &rlower);
1530
+ lower = ((rlower == Qnil) ? true : RTEST(rlower));
1531
+ if (rstop_words != Qnil) {
1532
+ char **stop_words = get_stopwords(rstop_words);
1533
+ a = mb_standard_analyzer_new_with_words((const char **)stop_words, lower);
1534
+ free(stop_words);
1535
+ } else {
1536
+ a = mb_standard_analyzer_new(lower);
1537
+ }
1538
+ Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
1539
+ object_add(a, self);
1540
+ return self;
1541
+ }
1542
+
1543
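+ /* GC helpers for PerFieldAnalyzer: mark the default analyzer and every
+  * analyzer stored in the per-field dictionary so Ruby's garbage collector
+  * doesn't free them while the PerFieldAnalyzer is still in use. */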
+ static void
1544
+ frt_h_mark_values_i(void *key, void *value, void *arg)
1545
+ {
1546
+ frt_gc_mark(value);
1547
+ }
1548
+
1549
+ static void
1550
+ frt_pfa_mark(void *p)
1551
+ {
1552
+ frt_gc_mark(PFA(p)->default_a);
1553
+ h_each(PFA(p)->dict, &frt_h_mark_values_i, NULL);
1554
+ }
1555
+
1556
+ /*** PerFieldAnalyzer ***/
1557
+
1558
+ /*
1559
+ * call-seq:
1560
+ * PerFieldAnalyzer.new(default_analyzer) -> analyzer
1561
+ *
1562
+ * Create a new PerFieldAnalyzer specifying the default analyzer to use on
1563
+ * all fields that aren't set specifically.
1564
+ *
1565
+ * default_analyzer:: analyzer to be used on fields that aren't otherwise
1566
+ * specified
1567
+ */
1568
+ static VALUE
1569
+ frt_per_field_analyzer_init(VALUE self, VALUE ranalyzer)
1570
+ {
1571
+ Analyzer *def = frt_get_cwrapped_analyzer(ranalyzer);
1572
+ Analyzer *a = per_field_analyzer_new(def);
1573
+ Frt_Wrap_Struct(self, &frt_pfa_mark, &frt_analyzer_free, a);
1574
+ object_add(a, self);
1575
+ return self;
1576
+ }
1577
+
1578
+ /*
1579
+ * call-seq:
1580
+ * per_field_analyzer.add_field(field_name, analyzer) -> self
1581
+ * per_field_analyzer[field_name] = analyzer -> self
1582
+ *
1583
+ * Set the analyzer to be used on field +field_name+. Note that field_name
1584
+ * should be a symbol.
1585
+ *
1586
+ * field_name:: field we wish to set the analyzer for
1587
+ * analyzer:: analyzer to be used on +field_name+
1588
+ */
1589
+ static VALUE
1590
+ frt_per_field_analyzer_add_field(VALUE self, VALUE rfield, VALUE ranalyzer)
1591
+ {
1592
+ Analyzer *pfa, *a;
1593
+ Data_Get_Struct(self, Analyzer, pfa);
1594
+ a = frt_get_cwrapped_analyzer(ranalyzer);
1595
+
1596
+ pfa_add_field(pfa, frt_field(rfield), a);
1597
+ return self;
1598
+ }
1599
+
1600
+ /*
1601
+ * call-seq:
1602
+ * analyzer.token_stream(field_name, input) -> token_stream
1603
+ *
1604
+ * Create a new TokenStream to tokenize +input+. The TokenStream created will
1605
+ * also depend on the +field_name+ in the case of the PerFieldAnalyzer.
1606
+ *
1607
+ * field_name:: name of the field to be tokenized
1608
+ * input:: data from the field to be tokenized
1609
+ */
1610
+ static VALUE
1611
+ frt_pfa_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
1612
+ {
1613
+ Analyzer *pfa, *a;
1614
+ char *field = frt_field(rfield);
1615
+ GET_A(pfa, self);
1616
+
1617
+ StringValue(rstring);
1618
+ a = (Analyzer *)h_get(PFA(pfa)->dict, field);
1619
+ if (a == NULL) {
1620
+ a = PFA(pfa)->default_a;
1621
+ }
1622
+ if (a->get_ts == cwa_get_ts) {
1623
+ return rb_funcall(CWA(a)->ranalyzer, id_token_stream, 2,
1624
+ ID2SYM(rb_intern(field)), rb_str_new2(rs2s(rstring)));
1625
+ }
1626
+ else {
1627
+ return get_rb_ts_from_a(a, rfield, rstring);
1628
+ }
1629
+ }
1630
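+ /* A minimal usage sketch of the per-field dispatch implemented above. The
+  * field names and analyzer choices are illustrative only:
+  *
+  *   pfa = PerFieldAnalyzer.new(StandardAnalyzer.new)
+  *   pfa[:title] = WhiteSpaceAnalyzer.new(false)
+  *   pfa.token_stream(:title, "Some Title")     # tokenized by the WhiteSpaceAnalyzer
+  *   pfa.token_stream(:body, "Some body text")  # falls back to the StandardAnalyzer
+  */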
+
1631
+ /*** RegExpAnalyzer ***/
1632
+
1633
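+ /* GC mark and destroy hooks for RegExpAnalyzer: keep the analyzer's current
+  * TokenStream marked during GC and release the reference to it when the
+  * analyzer itself is destroyed. */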
+ static void
1634
+ frt_re_analyzer_mark(Analyzer *a)
1635
+ {
1636
+ frt_gc_mark(a->current_ts);
1637
+ }
1638
+
1639
+ static void
1640
+ re_analyzer_destroy_i(Analyzer *a)
1641
+ {
1642
+ ts_deref(a->current_ts);
1643
+ free(a);
1644
+ }
1645
+
1646
+ /*
1647
+ * call-seq:
1648
+ * RegExpAnalyzer.new(reg_exp, lower = true) -> analyzer
1649
+ *
1650
+ * Create a new RegExpAnalyzer which will create tokenizers based on the
1651
+ * regular expression and lowercasing if required.
1652
+ *
1653
+ * reg_exp:: the token matcher for the tokenizer to use
1654
+ * lower:: set to false if you don't want to downcase the tokens
1655
+ */
1656
+ static VALUE
1657
+ frt_re_analyzer_init(int argc, VALUE *argv, VALUE self)
1658
+ {
1659
+ VALUE lower, rets, regex, proc;
1660
+ Analyzer *a;
1661
+ TokenStream *ts;
1662
+ rb_scan_args(argc, argv, "02&", &regex, &lower, &proc);
1663
+
1664
+ ts = rets_new(Qnil, regex, proc);
1665
+ rets = Data_Wrap_Struct(cRegExpTokenizer, &frt_rets_mark, &frt_rets_free, ts);
1666
+ object_add(ts, rets);
1667
+
1668
+ if (lower != Qfalse) {
1669
+ rets = frt_lowercase_filter_init(frt_data_alloc(cLowerCaseFilter), rets);
1670
+ ts = DATA_PTR(rets);
1671
+ }
1672
+ REF(ts);
1673
+
1674
+ a = analyzer_new(ts, &re_analyzer_destroy_i, NULL);
1675
+ Frt_Wrap_Struct(self, &frt_re_analyzer_mark, &frt_analyzer_free, a);
1676
+ object_add(a, self);
1677
+ return self;
1678
+ }
1679
+
1680
+ /*
1681
+ * call-seq:
1682
+ * analyzer.token_stream(field_name, input) -> token_stream
1683
+ *
1684
+ * Create a new TokenStream to tokenize +input+. The TokenStream created may
1685
+ * also depend on the +field_name+, although this parameter is typically
1686
+ * ignored.
1687
+ *
1688
+ * field_name:: name of the field to be tokenized
1689
+ * input:: data from the field to be tokenized
1690
+ */
1691
+ static VALUE
1692
+ frt_re_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rtext)
1693
+ {
1694
+ TokenStream *ts;
1695
+ Analyzer *a;
1696
+ GET_A(a, self);
1697
+
1698
+ StringValue(rtext);
1699
+
1700
+ ts = a_get_ts(a, frt_field(rfield), rs2s(rtext));
1701
+
1702
+ /* Make sure that there is no entry already */
1703
+ object_set(&ts->text, rtext);
1704
+ if (ts->next == &rets_next) {
1705
+ RETS(ts)->rtext = rtext;
1706
+ rb_hash_aset(object_space, ((VALUE)ts)|1, rtext);
1707
+ }
1708
+ else {
1709
+ RETS(((TokenFilter*)ts)->sub_ts)->rtext = rtext;
1710
+ rb_hash_aset(object_space, ((VALUE)((TokenFilter*)ts)->sub_ts)|1, rtext);
1711
+ }
1712
+ return get_rb_token_stream(ts);
1713
+ }
1714
+
1715
+ /****************************************************************************
1716
+ *
1717
+ * Locale stuff
1718
+ *
1719
+ ****************************************************************************/
1720
+
1721
+ /*
1722
+ * call-seq:
1723
+ * Ferret.locale -> locale_str
1724
+ *
1725
+ * Returns a string corresponding to the currently set locale. For example;
1726
+ *
1727
+ * puts Ferret.locale #=> "en_US.UTF-8"
1728
+ */
1729
+ static VALUE frt_get_locale(VALUE self)
1730
+ {
1731
+ return (frt_locale ? rb_str_new2(frt_locale) : Qnil);
1732
+ }
1733
+
1734
+ /*
1735
+ * call-seq:
1736
+ * Ferret.locale = "en_US.UTF-8"
1737
+ *
1738
+ * Set the global locale. You should use this method to set different locales
1739
+ * when indexing documents with different encodings.
1740
+ */
1741
+ static VALUE frt_set_locale(VALUE self, VALUE locale)
1742
+ {
1743
+ char *l = ((locale == Qnil) ? NULL : rs2s(rb_obj_as_string(locale)));
1744
+ frt_locale = setlocale(LC_CTYPE, l);
1745
+ return frt_locale ? rb_str_new2(frt_locale) : Qnil;
1746
+ }
1747
+
1748
+ /****************************************************************************
1749
+ *
1750
+ * Init Functions
1751
+ *
1752
+ ****************************************************************************/
1753
+
1754
+ /*
1755
+ * Document-class: Ferret::Analysis::Token
1756
+ *
1757
+ * == Summary
1758
+ *
1759
+ * A Token is an occurrence of a term from the text of a field. It consists
1760
+ * of a term's text and the start and end offset of the term in the text of
1761
+ * the field.
1762
+ *
1763
+ * The start and end offsets permit applications to re-associate a token with
1764
+ * its source text, e.g., to display highlighted query terms in a document
1765
+ * browser, or to show matching text fragments in a KWIC (KeyWord In Context)
1766
+ * display, etc.
1767
+ *
1768
+ * === Attributes
1769
+ *
1770
+ * text:: the term's text, which may have been modified by a TokenFilter or
1771
+ * Tokenizer from the text originally found in the document
1772
+ * start:: the position of the first character corresponding to
1773
+ * this token in the source text
1774
+ * end:: one greater than the position of the last
1775
+ * character corresponding to this token. Note that the
1776
+ * difference between @end_offset and @start_offset may not be
1777
+ * equal to @text.length(), as the term text may have been
1778
+ * altered by a stemmer or some other filter.
1779
+ */
1780
+ static void Init_Token(void)
1781
+ {
1782
+ cToken = rb_define_class_under(mAnalysis, "Token", rb_cObject);
1783
+ rb_define_alloc_func(cToken, frt_token_alloc);
1784
+ rb_include_module(cToken, rb_mComparable);
1785
+
1786
+ rb_define_method(cToken, "initialize", frt_token_init, -1);
1787
+ rb_define_method(cToken, "<=>", frt_token_cmp, 1);
1788
+ rb_define_method(cToken, "text", frt_token_get_text, 0);
1789
+ rb_define_method(cToken, "text=", frt_token_set_text, 1);
1790
+ rb_define_method(cToken, "start", frt_token_get_start_offset, 0);
1791
+ rb_define_method(cToken, "start=", frt_token_set_start_offset, 1);
1792
+ rb_define_method(cToken, "end", frt_token_get_end_offset, 0);
1793
+ rb_define_method(cToken, "end=", frt_token_set_end_offset, 1);
1794
+ rb_define_method(cToken, "pos_inc", frt_token_get_pos_inc, 0);
1795
+ rb_define_method(cToken, "pos_inc=", frt_token_set_pos_inc, 1);
1796
+ rb_define_method(cToken, "to_s", frt_token_to_s, 0);
1797
+ }
1798
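+ /* A minimal sketch of working with Token objects from Ruby, assuming the
+  * Token.new(text, start, end) constructor form and a default position
+  * increment of 1. The values mirror the stemming example in the module
+  * documentation below:
+  *
+  *   t = Token.new("begin", 10, 19)
+  *   t.text      #=> "begin"
+  *   t.start     #=> 10
+  *   t.end       #=> 19
+  *   t.pos_inc   #=> 1
+  */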
+
1799
+ /*
1800
+ * Document-class: Ferret::Analysis::TokenStream
1801
+ *
1802
+ * A TokenStream enumerates the sequence of tokens, either from
1803
+ * fields of a document or from query text.
1804
+ *
1805
+ * This is an abstract class. Concrete subclasses are:
1806
+ *
1807
+ * Tokenizer:: a TokenStream whose input is a string
1808
+ * TokenFilter:: a TokenStream whose input is another TokenStream
1809
+ */
1810
+ static void Init_TokenStream(void)
1811
+ {
1812
+ cTokenStream = rb_define_class_under(mAnalysis, "TokenStream", rb_cObject);
1813
+ frt_mark_cclass(cTokenStream);
1814
+ rb_define_method(cTokenStream, "next", frt_ts_next, 0);
1815
+ rb_define_method(cTokenStream, "text=", frt_ts_set_text, 1);
1816
+ rb_define_method(cTokenStream, "text", frt_ts_get_text, 0);
1817
+ rb_define_method(cTokenStream, "positions", frt_ts_positions, 1);
1818
+ rb_define_method(cTokenStream, "term_offsets", frt_ts_term_offsets, 1);
1819
+ }
1820
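+ /* A minimal sketch of consuming a TokenStream from Ruby via the #next method
+  * registered above; #next returns nil once the stream is exhausted. The
+  * tokenizer choice and input are illustrative:
+  *
+  *   ts = StandardTokenizer.new("Dave's résumé, at http://www.davebalmain.com/ 1234")
+  *   while token = ts.next
+  *     puts "#{token.text} [#{token.start}, #{token.end}]"
+  *   end
+  */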
+
1821
+ /*
1822
+ * Document-class: Ferret::Analysis::AsciiLetterTokenizer
1823
+ *
1824
+ * An AsciiLetterTokenizer is a tokenizer that divides text at anything that
1825
+ * is not an ASCII letter. That is to say, it defines tokens as maximal strings
1826
+ * of adjacent ASCII letters, as defined by the regular expression _/[A-Za-z]+/_.
1827
+ *
1828
+ * === Example
1829
+ *
1830
+ * "Dave's résumé, at http://www.davebalmain.com/ 1234"
1831
+ * => ["Dave", "s", "r", "sum", "at", "http", "www", "davebalmain", "com"]
1832
+ */
1833
+ static void Init_AsciiLetterTokenizer(void)
1834
+ {
1835
+ cAsciiLetterTokenizer =
1836
+ rb_define_class_under(mAnalysis, "AsciiLetterTokenizer", cTokenStream);
1837
+ frt_mark_cclass(cAsciiLetterTokenizer);
1838
+ rb_define_alloc_func(cAsciiLetterTokenizer, frt_data_alloc);
1839
+ rb_define_method(cAsciiLetterTokenizer, "initialize",
1840
+ frt_a_letter_tokenizer_init, 1);
1841
+ }
1842
+
1843
+ /*
1844
+ * Document-class: Ferret::Analysis::LetterTokenizer
1845
+ *
1846
+ * A LetterTokenizer is a tokenizer that divides text at non-letters. That is
1847
+ * to say, it defines tokens as maximal strings of adjacent letters, as
1848
+ * defined by the regular expression _/[[:alpha:]]+/_ where [:alpha] matches
1849
+ * all characters in your local locale.
1850
+ *
1851
+ * === Example
1852
+ *
1853
+ * "Dave's résumé, at http://www.davebalmain.com/ 1234"
1854
+ * => ["Dave", "s", "résumé", "at", "http", "www", "davebalmain", "com"]
1855
+ */
1856
+ static void Init_LetterTokenizer(void)
1857
+ {
1858
+ cLetterTokenizer =
1859
+ rb_define_class_under(mAnalysis, "LetterTokenizer", cTokenStream);
1860
+ frt_mark_cclass(cLetterTokenizer);
1861
+ rb_define_alloc_func(cLetterTokenizer, frt_data_alloc);
1862
+ rb_define_method(cLetterTokenizer, "initialize",
1863
+ frt_letter_tokenizer_init, -1);
1864
+ }
1865
+
1866
+ /*
1867
+ * Document-class: Ferret::Analysis::AsciiWhiteSpaceTokenizer
1868
+ *
1869
+ * A WhiteSpaceTokenizer is a tokenizer that divides text at white-space.
1870
+ * Adjacent sequences of non-WhiteSpace characters form tokens.
1871
+ *
1872
+ * === Example
1873
+ *
1874
+ * "Dave's résumé, at http://www.davebalmain.com/ 1234"
1875
+ * => ["Dave's", "résumé,", "at", "http://www.davebalmain.com", "1234"]
1876
+ */
1877
+ static void Init_AsciiWhiteSpaceTokenizer(void)
1878
+ {
1879
+ cAsciiWhiteSpaceTokenizer =
1880
+ rb_define_class_under(mAnalysis, "AsciiWhiteSpaceTokenizer",
1881
+ cTokenStream);
1882
+ frt_mark_cclass(cAsciiWhiteSpaceTokenizer);
1883
+ rb_define_alloc_func(cAsciiWhiteSpaceTokenizer, frt_data_alloc);
1884
+ rb_define_method(cAsciiWhiteSpaceTokenizer, "initialize",
1885
+ frt_a_whitespace_tokenizer_init, 1);
1886
+ }
1887
+
1888
+ /*
1889
+ * Document-class: Ferret::Analysis::WhiteSpaceTokenizer
1890
+ *
1891
+ * A WhiteSpaceTokenizer is a tokenizer that divides text at white-space.
1892
+ * Adjacent sequences of non-WhiteSpace characters form tokens.
1893
+ *
1894
+ * === Example
1895
+ *
1896
+ * "Dave's résumé, at http://www.davebalmain.com/ 1234"
1897
+ * => ["Dave's", "résumé,", "at", "http://www.davebalmain.com", "1234"]
1898
+ */
1899
+ static void Init_WhiteSpaceTokenizer(void)
1900
+ {
1901
+ cWhiteSpaceTokenizer =
1902
+ rb_define_class_under(mAnalysis, "WhiteSpaceTokenizer", cTokenStream);
1903
+ frt_mark_cclass(cWhiteSpaceTokenizer);
1904
+ rb_define_alloc_func(cWhiteSpaceTokenizer, frt_data_alloc);
1905
+ rb_define_method(cWhiteSpaceTokenizer, "initialize",
1906
+ frt_whitespace_tokenizer_init, -1);
1907
+ }
1908
+
1909
+ /*
1910
+ * Document-class: Ferret::Analysis::AsciiStandardTokenizer
1911
+ *
1912
+ * The standard tokenizer is an advanced tokenizer which tokenizes most
1913
+ * words correctly as well as tokenizing things like email addresses, web
1914
+ * addresses, phone numbers, etc.
1915
+ *
1916
+ * === Example
1917
+ *
1918
+ * "Dave's résumé, at http://www.davebalmain.com/ 1234"
1919
+ * => ["Dave's", "r", "sum", "at", "http://www.davebalmain.com", "1234"]
1920
+ */
1921
+ static void Init_AsciiStandardTokenizer(void)
1922
+ {
1923
+ cAsciiStandardTokenizer =
1924
+ rb_define_class_under(mAnalysis, "AsciiStandardTokenizer", cTokenStream);
1925
+ frt_mark_cclass(cAsciiStandardTokenizer);
1926
+ rb_define_alloc_func(cAsciiStandardTokenizer, frt_data_alloc);
1927
+ rb_define_method(cAsciiStandardTokenizer, "initialize",
1928
+ frt_a_standard_tokenizer_init, 1);
1929
+ }
1930
+
1931
+ /*
1932
+ * Document-class: Ferret::Analysis::StandardTokenizer
1933
+ *
1934
+ * The standard tokenizer is an advanced tokenizer which tokenizes most
1935
+ * words correctly as well as tokenizing things like email addresses, web
1936
+ * addresses, phone numbers, etc.
1937
+ *
1938
+ * === Example
1939
+ *
1940
+ * "Dave's résumé, at http://www.davebalmain.com/ 1234"
1941
+ * => ["Dave's", "résumé", "at", "http://www.davebalmain.com", "1234"]
1942
+ */
1943
+ static void Init_StandardTokenizer(void)
1944
+ {
1945
+ cStandardTokenizer =
1946
+ rb_define_class_under(mAnalysis, "StandardTokenizer", cTokenStream);
1947
+ frt_mark_cclass(cStandardTokenizer);
1948
+ rb_define_alloc_func(cStandardTokenizer, frt_data_alloc);
1949
+ rb_define_method(cStandardTokenizer, "initialize",
1950
+ frt_standard_tokenizer_init, 1);
1951
+ }
1952
+
1953
+ /*
1954
+ * Document-class: Ferret::Analysis::RegExpTokenizer
1955
+ *
1956
+ * A tokenizer that recognizes tokens based on a regular expression passed to
1957
+ * the constructor. Most tokenizers can be implemented using this class.
1958
+ *
1959
+ * === Example
1960
+ *
1961
+ * Below is an example of a simple implementation of a LetterTokenizer using
1962
+ * a RegExpTokenizer. Basically, a token is a sequence of alphabetic
1963
+ * characters separated by one or more non-alphabetic characters.
1964
+ *
1965
+ * # of course you would add more than just é
1966
+ * RegExpTokenizer.new(input, /[[:alpha:]é]+/)
1967
+ *
1968
+ * "Dave's résumé, at http://www.davebalmain.com/ 1234"
1969
+ * => ["Dave", "s", "résumé", "at", "http", "www", "davebalmain", "com"]
1970
+ */
1971
+ static void Init_RegExpTokenizer(void)
1972
+ {
1973
+ cRegExpTokenizer =
1974
+ rb_define_class_under(mAnalysis, "RegExpTokenizer", cTokenStream);
1975
+ frt_mark_cclass(cRegExpTokenizer);
1976
+ rtoken_re = rb_reg_new(TOKEN_RE, strlen(TOKEN_RE), 0);
1977
+ rb_define_const(cRegExpTokenizer, "REGEXP", rtoken_re);
1978
+ rb_define_alloc_func(cRegExpTokenizer, frt_data_alloc);
1979
+ rb_define_method(cRegExpTokenizer, "initialize",
1980
+ frt_rets_init, -1);
1981
+ rb_define_method(cRegExpTokenizer, "text=", frt_rets_set_text, 1);
1982
+ rb_define_method(cRegExpTokenizer, "text", frt_rets_get_text, 0);
1983
+ }
1984
+
1985
+ /***************/
1986
+ /*** Filters ***/
1987
+ /***************/
1988
+
1989
+ /*
1990
+ * Document-class: Ferret::Analysis::AsciiLowerCaseFilter
1991
+ *
1992
+ * AsciiLowerCaseFilter normalizes a token's text to lowercase but only for
1993
+ * ASCII characters. For other characters use LowerCaseFilter.
1994
+ *
1995
+ * === Example
1996
+ *
1997
+ * ["One", "TWO", "three", "RÉSUMÉ"] => ["one", "two", "three", "rÉsumÉ"]
1998
+ *
1999
+ */
2000
+ static void Init_AsciiLowerCaseFilter(void)
2001
+ {
2002
+ cAsciiLowerCaseFilter =
2003
+ rb_define_class_under(mAnalysis, "AsciiLowerCaseFilter", cTokenStream);
2004
+ frt_mark_cclass(cAsciiLowerCaseFilter);
2005
+ rb_define_alloc_func(cAsciiLowerCaseFilter, frt_data_alloc);
2006
+ rb_define_method(cAsciiLowerCaseFilter, "initialize",
2007
+ frt_a_lowercase_filter_init, 1);
2008
+ }
2009
+
2010
+ /*
2011
+ * Document-class: Ferret::Analysis::LowerCaseFilter
2012
+ *
2013
+ * LowerCaseFilter normalizes a token's text to lowercase based on the
2014
+ * current locale.
2015
+ *
2016
+ * === Example
2017
+ *
2018
+ * ["One", "TWO", "three", "RÉSUMÉ"] => ["one", "two", "three", "résumé"]
2019
+ *
2020
+ */
2021
+ static void Init_LowerCaseFilter(void)
2022
+ {
2023
+ cLowerCaseFilter =
2024
+ rb_define_class_under(mAnalysis, "LowerCaseFilter", cTokenStream);
2025
+ frt_mark_cclass(cLowerCaseFilter);
2026
+ rb_define_alloc_func(cLowerCaseFilter, frt_data_alloc);
2027
+ rb_define_method(cLowerCaseFilter, "initialize",
2028
+ frt_lowercase_filter_init, 1);
2029
+ }
2030
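+ /* TokenFilters wrap another TokenStream, so a LowerCaseFilter is normally
+  * chained onto a tokenizer. A minimal sketch (the tokenizer choice is
+  * illustrative):
+  *
+  *   ts = LowerCaseFilter.new(StandardTokenizer.new(str))
+  */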
+
2031
+ /*
2032
+ * Document-class: Ferret::Analysis::HyphenFilter
2033
+ *
2034
+ * HyphenFilter filters hyphenated words by adding both the concatenated form
2035
+ * and the individual parts, i.e. "e-mail" becomes
2036
+ * "email" and "e mail". This way searches for "e-mail", "email" and "mail"
2037
+ * will all match. This filter is used by default by the StandardAnalyzer.
2038
+ *
2039
+ * === Example
2040
+ *
2041
+ * ["e-mail", "set-up"] => ["email", "e", "mail", "setup", "set", "up"]
2042
+ *
2043
+ */
2044
+ static void Init_HyphenFilter(void)
2045
+ {
2046
+ cHyphenFilter =
2047
+ rb_define_class_under(mAnalysis, "HyphenFilter", cTokenStream);
2048
+ frt_mark_cclass(cHyphenFilter);
2049
+ rb_define_alloc_func(cHyphenFilter, frt_data_alloc);
2050
+ rb_define_method(cHyphenFilter, "initialize", frt_hyphen_filter_init, 1);
2051
+ }
2052
+
2053
+ /*
2054
+ * Document-class: Ferret::Analysis::MappingFilter
2055
+ *
2056
+ * A MappingFilter maps strings in tokens. This is usually used to map UTF-8
2057
+ * characters to ASCII characters for easier searching and better search
2058
+ * recall. The mapping is compiled into a Deterministic Finite Automaton so it
2059
+ * is super fast. This filter can therefore be used for indexing very large
2060
+ * datasets. Currently regular expressions are not supported. If you are
2061
+ * really interested in the feature, please contact me at dbalmain@gmail.com.
2062
+ *
2063
+ * == Example
2064
+ *
2065
+ * mapping = {
2066
+ * ['à','á','â','ã','ä','å','ā','ă'] => 'a',
2067
+ * 'æ' => 'ae',
2068
+ * ['ď','đ'] => 'd',
2069
+ * ['ç','ć','č','ĉ','ċ'] => 'c',
2070
+ * ['è','é','ê','ë','ē','ę','ě','ĕ','ė',] => 'e',
2071
+ * ['ƒ'] => 'f',
2072
+ * ['ĝ','ğ','ġ','ģ'] => 'g',
2073
+ * ['ĥ','ħ'] => 'h',
2074
+ * ['ì','ì','í','î','ï','ī','ĩ','ĭ'] => 'i',
2075
+ * ['į','ı','ij','ĵ'] => 'j',
2076
+ * ['ķ','ĸ'] => 'k',
2077
+ * ['ł','ľ','ĺ','ļ','ŀ'] => 'l',
2078
+ * ['ñ','ń','ň','ņ','ʼn','ŋ'] => 'n',
2079
+ * ['ò','ó','ô','õ','ö','ø','ō','ő','ŏ','ŏ'] => 'o',
2080
+ * ['œ'] => 'oek',
2081
+ * ['ą'] => 'q',
2082
+ * ['ŕ','ř','ŗ'] => 'r',
2083
+ * ['ś','š','ş','ŝ','ș'] => 's',
2084
+ * ['ť','ţ','ŧ','ț'] => 't',
2085
+ * ['ù','ú','û','ü','ū','ů','ű','ŭ','ũ','ų'] => 'u',
2086
+ * ['ŵ'] => 'w',
2087
+ * ['ý','ÿ','ŷ'] => 'y',
2088
+ * ['ž','ż','ź'] => 'z'
2089
+ * }
2090
+ * filt = MappingFilter.new(token_stream, mapping)
2091
+ */
2092
+ static void Init_MappingFilter(void)
2093
+ {
2094
+ cMappingFilter =
2095
+ rb_define_class_under(mAnalysis, "MappingFilter", cTokenStream);
2096
+ frt_mark_cclass(cMappingFilter);
2097
+ rb_define_alloc_func(cMappingFilter, frt_data_alloc);
2098
+ rb_define_method(cMappingFilter, "initialize",
2099
+ frt_mapping_filter_init, 2);
2100
+ }
2101
+
2102
+ /*
2103
+ * Document-class: Ferret::Analysis::StopFilter
2104
+ *
2105
+ * A StopFilter filters *stop-words* from a TokenStream. Stop-words are words
2106
+ * that you don't wish to be indexed. Usually they will be common words like
2107
+ * "the" and "and" although you can specify whichever words you want.
2108
+ *
2109
+ * === Example
2110
+ *
2111
+ * ["the", "pig", "and", "whistle"] => ["pig", "whistle"]
2112
+ */
2113
+ static void Init_StopFilter(void)
2114
+ {
2115
+ cStopFilter =
2116
+ rb_define_class_under(mAnalysis, "StopFilter", cTokenStream);
2117
+ frt_mark_cclass(cStopFilter);
2118
+ rb_define_alloc_func(cStopFilter, frt_data_alloc);
2119
+ rb_define_method(cStopFilter, "initialize",
2120
+ frt_stop_filter_init, -1);
2121
+ }
2122
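+ /* A minimal StopFilter sketch matching the example above. The stop-word list
+  * passed here is illustrative; the *_STOP_WORDS constants defined by this
+  * module can be used instead:
+  *
+  *   ts = StandardTokenizer.new("the pig and whistle")
+  *   ts = StopFilter.new(ts, ["the", "and"])
+  */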
+
2123
+ /*
2124
+ * Document-class: Ferret::Analysis::StemFilter
2125
+ *
2126
+ * == Summary
2127
+ *
2128
+ * A StemFilter takes a term and transforms the term as per the Snowball
2129
+ * stemming algorithm. Note: the input to the stemming filter must already
2130
+ * be in lower case, so you will need to use a LowerCaseFilter or a lowercasing
2131
+ * Tokenizer further down the Tokenizer chain in order for this to work
2132
+ * properly!
2133
+ *
2134
+ * === Available algorithms and encodings
2135
+ *
2136
+ * Algorithm Algorithm Pseudonyms Encoding
2137
+ * ----------------------------------------------------------------
2138
+ * "danish", | "da", "dan" | "ISO_8859_1", "UTF_8"
2139
+ * "dutch", | "dut", "nld" | "ISO_8859_1", "UTF_8"
2140
+ * "english", | "en", "eng" | "ISO_8859_1", "UTF_8"
2141
+ * "finnish", | "fi", "fin" | "ISO_8859_1", "UTF_8"
2142
+ * "french", | "fr", "fra", "fre" | "ISO_8859_1", "UTF_8"
2143
+ * "german", | "de", "deu", "ge", "ger" | "ISO_8859_1", "UTF_8"
2144
+ * "italian", | "it", "ita" | "ISO_8859_1", "UTF_8"
2145
+ * "norwegian", | "nl", "no" | "ISO_8859_1", "UTF_8"
2146
+ * "porter", | | "ISO_8859_1", "UTF_8"
2147
+ * "portuguese", | "por", "pt" | "ISO_8859_1", "UTF_8"
2148
+ * "russian", | "ru", "rus" | "KOI8_R", "UTF_8"
2149
+ * "spanish", | "es", "esl" | "ISO_8859_1", "UTF_8"
2150
+ * "swedish", | "sv", "swe" | "ISO_8859_1", "UTF_8"
2151
+ *
2152
+ * === Example
2153
+ *
2154
+ * To use this filter with other analyzers, you'll want to write an Analyzer
2155
+ * class that sets up the TokenStream chain as you want it. To use this with
2156
+ * a lowercasing Tokenizer, for example, you'd write an analyzer like this:
2157
+ *
2158
+ * class MyAnalyzer < Analyzer
2159
+ * def token_stream(field, str)
2160
+ * return StemFilter.new(LowerCaseFilter.new(StandardTokenizer.new(str)))
2161
+ * end
2162
+ * end
2163
+ *
2164
+ * "debate debates debated debating debater"
2165
+ * => ["debat", "debat", "debat", "debat", "debat"]
2166
+ *
2167
+ * === Attributes
2168
+ *
2169
+ * token_stream:: TokenStream to be filtered
2170
+ * algorithm:: The algorithm (or language) to use (default: "english")
2171
+ * encoding:: The encoding of the data (default: "UTF-8")
2172
+ */
2173
+ static void Init_StemFilter(void)
2174
+ {
2175
+ cStemFilter =
2176
+ rb_define_class_under(mAnalysis, "StemFilter", cTokenStream);
2177
+ frt_mark_cclass(cStemFilter);
2178
+ rb_define_alloc_func(cStemFilter, frt_data_alloc);
2179
+ rb_define_method(cStemFilter, "initialize",
2180
+ frt_stem_filter_init, -1);
2181
+ }
2182
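+ /* A minimal StemFilter sketch using the algorithm and encoding attributes
+  * listed above. Remember that the incoming tokens must already be lowercased:
+  *
+  *   ts = LowerCaseFilter.new(StandardTokenizer.new(str))
+  *   ts = StemFilter.new(ts, "english", "UTF_8")
+  */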
+
2183
+ /*************************/
2184
+ /*** * * Analyzers * * ***/
2185
+ /*************************/
2186
+
2187
+ /*
2188
+ * Document-class: Ferret::Analysis::Analyzer
2189
+ *
2190
+ * == Summary
2191
+ *
2192
+ * An Analyzer builds TokenStreams, which analyze text. It thus represents
2193
+ * a policy for extracting index terms from text.
2194
+ *
2195
+ * Typical implementations first build a Tokenizer, which breaks the stream
2196
+ * of characters from the Reader into raw Tokens. One or more TokenFilters
2197
+ * may then be applied to the output of the Tokenizer.
2198
+ *
2199
+ * The default Analyzer behaves like a lowercasing LetterAnalyzer, converting
2200
+ * all text to lowercase tokens. See LetterAnalyzer for more details.
2201
+ *
2202
+ * === Example
2203
+ *
2204
+ * To create your own custom Analyzer you simply need to implement a
2205
+ * token_stream method which takes the field name and the data to be
2206
+ * tokenized as parameters and returns a TokenStream. Most analyzers
2207
+ * typically ignore the field name.
2208
+ *
2209
+ * Here we'll create a StemmingAnalyzer;
2210
+ *
2211
+ * class MyAnalyzer < Analyzer
2212
+ * def token_stream(field, str)
2213
+ * return StemFilter.new(LowerCaseFilter.new(StandardTokenizer.new(str)))
2214
+ * end
2215
+ * end
2216
+ */
2217
+ static void Init_Analyzer(void)
2218
+ {
2219
+ cAnalyzer =
2220
+ rb_define_class_under(mAnalysis, "Analyzer", rb_cObject);
2221
+ frt_mark_cclass(cAnalyzer);
2222
+ rb_define_alloc_func(cAnalyzer, frt_data_alloc);
2223
+ rb_define_method(cAnalyzer, "initialize", frt_letter_analyzer_init, -1);
2224
+ rb_define_method(cAnalyzer, "token_stream", frt_analyzer_token_stream, 2);
2225
+ }
2226
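+ /* A minimal sketch of asking an Analyzer for a TokenStream. The field name
+  * and input text are illustrative; most analyzers ignore the field name:
+  *
+  *   analyzer = StandardAnalyzer.new
+  *   ts = analyzer.token_stream(:content, "Tokenize this text")
+  *   while token = ts.next
+  *     puts token.text
+  *   end
+  */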
+
2227
+ /*
2228
+ * Document-class: Ferret::Analysis::AsciiLetterAnalyzer
2229
+ *
2230
+ * == Summary
2231
+ *
2232
+ * An AsciiLetterAnalyzer creates a TokenStream that splits the input up into
2233
+ * maximal strings of ASCII letters. If implemented in Ruby it would look
2234
+ * like;
2235
+ *
2236
+ * class AsciiLetterAnalyzer
2237
+ * def initialize(lower = true)
2238
+ * @lower = lower
2239
+ * end
2240
+ *
2241
+ * def token_stream(field, str)
2242
+ * if @lower
2243
+ * return AsciiLowerCaseFilter.new(AsciiLetterTokenizer.new(str))
2244
+ * else
2245
+ * return AsciiLetterTokenizer.new(str)
2246
+ * end
2247
+ * end
2248
+ * end
2249
+ *
2250
+ * As you can see it makes use of the AsciiLetterTokenizer and
2251
+ * AsciiLowerCaseFilter. Note that this tokenizer won't recognize non-ASCII
2252
+ * characters so you should use the LetterAnalyzer if you want to analyze
2253
+ * multi-byte data like "UTF-8".
2254
+ */
2255
+ static void Init_AsciiLetterAnalyzer(void)
2256
+ {
2257
+ cAsciiLetterAnalyzer =
2258
+ rb_define_class_under(mAnalysis, "AsciiLetterAnalyzer", cAnalyzer);
2259
+ frt_mark_cclass(cAsciiLetterAnalyzer);
2260
+ rb_define_alloc_func(cAsciiLetterAnalyzer, frt_data_alloc);
2261
+ rb_define_method(cAsciiLetterAnalyzer, "initialize",
2262
+ frt_a_letter_analyzer_init, -1);
2263
+ }
2264
+
2265
+ /*
2266
+ * Document-class: Ferret::Analysis::LetterAnalyzer
2267
+ *
2268
+ * == Summary
2269
+ *
2270
+ * A LetterAnalyzer creates a TokenStream that splits the input up into
2271
+ * maximal strings of letters, as recognized by the current locale. If
2272
+ * implemented in Ruby it would look like;
2273
+ *
2274
+ * class LetterAnalyzer
2275
+ * def initialize(lower = true)
2276
+ * @lower = lower
2277
+ * end
2278
+ *
2279
+ * def token_stream(field, str)
2280
+ * return LetterTokenizer.new(str, @lower)
2281
+ * end
2282
+ * end
2283
+ *
2284
+ * As you can see it makes use of the LetterTokenizer.
2285
+ */
2286
+ static void Init_LetterAnalyzer(void)
2287
+ {
2288
+ cLetterAnalyzer =
2289
+ rb_define_class_under(mAnalysis, "LetterAnalyzer", cAnalyzer);
2290
+ frt_mark_cclass(cLetterAnalyzer);
2291
+ rb_define_alloc_func(cLetterAnalyzer, frt_data_alloc);
2292
+ rb_define_method(cLetterAnalyzer, "initialize",
2293
+ frt_letter_analyzer_init, -1);
2294
+ }
2295
+
2296
+ /*
2297
+ * Document-class: Ferret::Analysis::AsciiWhiteSpaceAnalyzer
2298
+ *
2299
+ * == Summary
2300
+ *
2301
+ * The AsciiWhiteSpaceAnalyzer recognizes tokens as maximal strings of
2302
+ * non-whitespace characters. If implemented in Ruby the
2303
+ * AsciiWhiteSpaceAnalyzer would look like;
2304
+ *
2305
+ * class AsciiWhiteSpaceAnalyzer
2306
+ * def initialize(lower = true)
2307
+ * @lower = lower
2308
+ * end
2309
+ *
2310
+ * def token_stream(field, str)
2311
+ * if @lower
2312
+ * return AsciiLowerCaseFilter.new(AsciiWhiteSpaceTokenizer.new(str))
2313
+ * else
2314
+ * return AsciiWhiteSpaceTokenizer.new(str)
2315
+ * end
2316
+ * end
2317
+ * end
2318
+ *
2319
+ * As you can see it makes use of the AsciiWhiteSpaceTokenizer. You should
2320
+ * use WhiteSpaceAnalyzer if you want to recognize multibyte encodings such
2321
+ * as "UTF-8".
2322
+ */
2323
+ static void Init_AsciiWhiteSpaceAnalyzer(void)
2324
+ {
2325
+ cAsciiWhiteSpaceAnalyzer =
2326
+ rb_define_class_under(mAnalysis, "AsciiWhiteSpaceAnalyzer", cAnalyzer);
2327
+ frt_mark_cclass(cAsciiWhiteSpaceAnalyzer);
2328
+ rb_define_alloc_func(cAsciiWhiteSpaceAnalyzer, frt_data_alloc);
2329
+ rb_define_method(cAsciiWhiteSpaceAnalyzer, "initialize",
2330
+ frt_a_white_space_analyzer_init, -1);
2331
+ }
2332
+
2333
+ /*
2334
+ * Document-class: Ferret::Analysis::WhiteSpaceAnalyzer
2335
+ *
2336
+ * == Summary
2337
+ *
2338
+ * The WhiteSpaceAnalyzer recognizes tokens as maximal strings of
2339
+ * non-whitespace characters. If implemented in Ruby the WhiteSpaceAnalyzer
2340
+ * would look like;
2341
+ *
2342
+ * class WhiteSpaceAnalyzer
2343
+ * def initialize(lower = true)
2344
+ * @lower = lower
2345
+ * end
2346
+ *
2347
+ * def token_stream(field, str)
2348
+ * return WhiteSpaceTokenizer.new(str, @lower)
2349
+ * end
2350
+ * end
2351
+ *
2352
+ * As you can see it makes use of the WhiteSpaceTokenizer.
2353
+ */
2354
+ static void Init_WhiteSpaceAnalyzer(void)
2355
+ {
2356
+ cWhiteSpaceAnalyzer =
2357
+ rb_define_class_under(mAnalysis, "WhiteSpaceAnalyzer", cAnalyzer);
2358
+ frt_mark_cclass(cWhiteSpaceAnalyzer);
2359
+ rb_define_alloc_func(cWhiteSpaceAnalyzer, frt_data_alloc);
2360
+ rb_define_method(cWhiteSpaceAnalyzer, "initialize",
2361
+ frt_white_space_analyzer_init, -1);
2362
+ }
2363
+
2364
+ /*
2365
+ * Document-class: Ferret::Analysis::AsciiStandardAnalyzer
2366
+ *
2367
+ * == Summary
2368
+ *
2369
+ * The AsciiStandardAnalyzer is the most advanced of the available
2370
+ * ASCII-analyzers. If it were implemented in Ruby it would look like this;
2371
+ *
2372
+ * class AsciiStandardAnalyzer
2373
+ * def initialize(stop_words = FULL_ENGLISH_STOP_WORDS, lower = true)
2374
+ * @lower = lower
2375
+ * @stop_words = stop_words
2376
+ * end
2377
+ *
2378
+ * def token_stream(field, str)
2379
+ * ts = AsciiStandardTokenizer.new(str)
2380
+ * ts = AsciiLowerCaseFilter.new(ts) if @lower
2381
+ * ts = StopFilter.new(ts, @stop_words)
2382
+ * ts = HyphenFilter.new(ts)
2383
+ * end
2384
+ * end
2385
+ *
2386
+ * As you can see it makes use of the AsciiStandardTokenizer and you can also
2387
+ * add your own list of stop-words if you wish. Note that this tokenizer
2388
+ * won't recognize non-ASCII characters so you should use the
2389
+ * StandardAnalyzer if you want to analyze multi-byte data like "UTF-8".
2390
+ */
2391
+ static void Init_AsciiStandardAnalyzer(void)
2392
+ {
2393
+ cAsciiStandardAnalyzer =
2394
+ rb_define_class_under(mAnalysis, "AsciiStandardAnalyzer", cAnalyzer);
2395
+ frt_mark_cclass(cAsciiStandardAnalyzer);
2396
+ rb_define_alloc_func(cAsciiStandardAnalyzer, frt_data_alloc);
2397
+ rb_define_method(cAsciiStandardAnalyzer, "initialize",
2398
+ frt_a_standard_analyzer_init, -1);
2399
+ }
2400
+
2401
+ /*
2402
+ * Document-class: Ferret::Analysis::StandardAnalyzer
2403
+ *
2404
+ * == Summary
2405
+ *
2406
+ * The StandardAnalyzer is the most advanced of the available analyzers. If
2407
+ * it were implemented in Ruby it would look like this;
2408
+ *
2409
+ * class StandardAnalyzer
2410
+ * def initialize(stop_words = FULL_ENGLISH_STOP_WORDS, lower = true)
2411
+ * @lower = lower
2412
+ * @stop_words = stop_words
2413
+ * end
2414
+ *
2415
+ * def token_stream(field, str)
2416
+ * ts = StandardTokenizer.new(str)
2417
+ * ts = LowerCaseFilter.new(ts) if @lower
2418
+ * ts = StopFilter.new(ts, @stop_words)
2419
+ * ts = HyphenFilter.new(ts)
2420
+ * end
2421
+ * end
2422
+ *
2423
+ * As you can see it makes use of the StandardTokenizer and you can also add
2424
+ * your own list of stopwords if you wish.
2425
+ */
2426
+ static void Init_StandardAnalyzer(void)
2427
+ {
2428
+ cStandardAnalyzer =
2429
+ rb_define_class_under(mAnalysis, "StandardAnalyzer", cAnalyzer);
2430
+ frt_mark_cclass(cStandardAnalyzer);
2431
+ rb_define_alloc_func(cStandardAnalyzer, frt_data_alloc);
2432
+ rb_define_method(cStandardAnalyzer, "initialize",
2433
+ frt_standard_analyzer_init, -1);
2434
+ }
2435
+
2436
+ /*
2437
+ * Document-class: Ferret::Analysis::PerFieldAnalyzer
2438
+ *
2439
+ * == Summary
2440
+ *
2441
+ * The PerFieldAnalyzer is for use when you want to analyze different fields
2442
+ * with different analyzers. With the PerFieldAnalyzer you can specify how
2443
+ * you want each field analyzed.
2444
+ *
2445
+ * === Example
2446
+ *
2447
+ * # Create a new PerFieldAnalyzer which uses StandardAnalyzer by default
2448
+ * pfa = PerFieldAnalyzer.new(StandardAnalyzer.new())
2449
+ *
2450
+ * # Use the WhiteSpaceAnalyzer with no lowercasing on the :title field
2451
+ * pfa[:title] = WhiteSpaceAnalyzer.new(false)
2452
+ *
2453
+ * # Use a custom analyzer on the :created_at field
2454
+ * pfa[:created_at] = DateAnalyzer.new
2455
+ */
2456
+ static void Init_PerFieldAnalyzer(void)
2457
+ {
2458
+ cPerFieldAnalyzer =
2459
+ rb_define_class_under(mAnalysis, "PerFieldAnalyzer", cAnalyzer);
2460
+ frt_mark_cclass(cPerFieldAnalyzer);
2461
+ rb_define_alloc_func(cPerFieldAnalyzer, frt_data_alloc);
2462
+ rb_define_method(cPerFieldAnalyzer, "initialize",
2463
+ frt_per_field_analyzer_init, 1);
2464
+ rb_define_method(cPerFieldAnalyzer, "add_field",
2465
+ frt_per_field_analyzer_add_field, 2);
2466
+ rb_define_method(cPerFieldAnalyzer, "[]=",
2467
+ frt_per_field_analyzer_add_field, 2);
2468
+ rb_define_method(cPerFieldAnalyzer, "token_stream",
2469
+ frt_pfa_analyzer_token_stream, 2);
2470
+ }
2471
+
2472
+ /*
2473
+ * Document-class: Ferret::Analysis::RegExpAnalyzer
2474
+ *
2475
+ * == Summary
2476
+ *
2477
+ * Using a RegExpAnalyzer is a simple way to create a custom analyzer. If
2478
+ * implemented in Ruby it would look like this;
2479
+ *
2480
+ * class RegExpAnalyzer
2481
+ * def initialize(reg_exp, lower = true)
2482
+ * @lower = lower
2483
+ * @reg_exp = reg_exp
2484
+ * end
2485
+ *
2486
+ * def token_stream(field, str)
2487
+ * if @lower
2488
+ * return LowerCaseFilter.new(RegExpTokenizer.new(str, @reg_exp))
2489
+ * else
2490
+ * return RegExpTokenizer.new(str, @reg_exp)
2491
+ * end
2492
+ * end
2493
+ * end
2494
+ *
2495
+ * === Example
2496
+ *
2497
+ * csv_analyzer = RegExpAnalyzer.new(/[^,]+/, false)
2498
+ */
2499
+ static void Init_RegExpAnalyzer(void)
2500
+ {
2501
+ cRegExpAnalyzer =
2502
+ rb_define_class_under(mAnalysis, "RegExpAnalyzer", cAnalyzer);
2503
+ frt_mark_cclass(cRegExpAnalyzer);
2504
+ rb_define_alloc_func(cRegExpAnalyzer, frt_data_alloc);
2505
+ rb_define_method(cRegExpAnalyzer, "initialize",
2506
+ frt_re_analyzer_init, -1);
2507
+ rb_define_method(cRegExpAnalyzer, "token_stream",
2508
+ frt_re_analyzer_token_stream, 2);
2509
+ }
2510
+
2511
+ /* rdoc hack
2512
+ extern VALUE mFerret = rb_define_module("Ferret");
2513
+ */
2514
+
2515
+ /*
2516
+ * Document-module: Ferret::Analysis
2517
+ *
2518
+ * == Summary
2519
+ *
2520
+ * The Analysis module contains all the classes used to analyze and tokenize
2521
+ * the data to be indexed. There are three main classes you need to know
2522
+ * about when dealing with analysis; Analyzer, TokenStream and Token.
2523
+ *
2524
+ * == Classes
2525
+ *
2526
+ * === Analyzer
2527
+ *
2528
+ * Analyzers handle all of your tokenizing needs. You pass an Analyzer to the
2529
+ * indexing class when you create it and it will create the TokenStreams
2530
+ * necessary to tokenize the fields in the documents. Most of the time you
2531
+ * won't need to worry about TokenStreams and Tokens, one of the Analyzers
2532
+ * distributed with Ferret will do exactly what you need. Otherwise you'll
2533
+ * need to implement a custom analyzer.
2534
+ *
2535
+ * === TokenStream
2536
+ *
2537
+ * A TokenStream is an enumeration of Tokens. There are two standard types of
2538
+ * TokenStream; Tokenizer and TokenFilter. A Tokenizer takes a String and
2539
+ * turns it into a list of Tokens. A TokenFilter takes another TokenStream
2540
+ * and post-processes the Tokens. You can chain as many TokenFilters together
2541
+ * as you like but they always need to finish with a Tokenizer.
2542
+ *
2543
+ * === Token
2544
+ *
2545
+ * A Token is a single term from a document field. A token contains the text
2546
+ * representing the term as well as the start and end offset of the token.
2547
+ * The start and end offset will represent the token as it appears in the
2548
+ * source field. Some TokenFilters may change the text in the Token but the
2549
+ * start and end offsets should stay the same so (end - start) won't
2550
+ * necessarily be equal to the length of text in the token. For example using
2551
+ * a stemming TokenFilter the term "Beginning" might have start and end
2552
+ * offsets of 10 and 19 respectively ("Beginning".length == 9) but Token#text
2553
+ * might be "begin" (after stemming).
2554
+ */
2555
+ void
2556
+ Init_Analysis(void)
2557
+ {
2558
+ mAnalysis = rb_define_module_under(mFerret, "Analysis");
2559
+
2560
+ /* TokenStream Methods */
2561
+ id_next = rb_intern("next");
2562
+ id_reset = rb_intern("text=");
2563
+ id_clone = rb_intern("clone");
2564
+ id_text = rb_intern("@text");
2565
+
2566
+ /* Analyzer Methods */
2567
+ id_token_stream = rb_intern("token_stream");
2568
+
2569
+ object_space = rb_hash_new();
2570
+ rb_define_const(mFerret, "OBJECT_SPACE", object_space);
2571
+
2572
+ /*** * * Locale stuff * * ***/
2573
+ rb_define_singleton_method(mFerret, "locale=", frt_set_locale, 1);
2574
+ rb_define_singleton_method(mFerret, "locale", frt_get_locale, 0);
2575
+
2576
+ rb_define_const(mAnalysis, "ENGLISH_STOP_WORDS",
2577
+ get_rstopwords(ENGLISH_STOP_WORDS));
2578
+ rb_define_const(mAnalysis, "FULL_ENGLISH_STOP_WORDS",
2579
+ get_rstopwords(FULL_ENGLISH_STOP_WORDS));
2580
+ rb_define_const(mAnalysis, "EXTENDED_ENGLISH_STOP_WORDS",
2581
+ get_rstopwords(EXTENDED_ENGLISH_STOP_WORDS));
2582
+ rb_define_const(mAnalysis, "FULL_FRENCH_STOP_WORDS",
2583
+ get_rstopwords(FULL_FRENCH_STOP_WORDS));
2584
+ rb_define_const(mAnalysis, "FULL_SPANISH_STOP_WORDS",
2585
+ get_rstopwords(FULL_SPANISH_STOP_WORDS));
2586
+ rb_define_const(mAnalysis, "FULL_PORTUGUESE_STOP_WORDS",
2587
+ get_rstopwords(FULL_PORTUGUESE_STOP_WORDS));
2588
+ rb_define_const(mAnalysis, "FULL_ITALIAN_STOP_WORDS",
2589
+ get_rstopwords(FULL_ITALIAN_STOP_WORDS));
2590
+ rb_define_const(mAnalysis, "FULL_GERMAN_STOP_WORDS",
2591
+ get_rstopwords(FULL_GERMAN_STOP_WORDS));
2592
+ rb_define_const(mAnalysis, "FULL_DUTCH_STOP_WORDS",
2593
+ get_rstopwords(FULL_DUTCH_STOP_WORDS));
2594
+ rb_define_const(mAnalysis, "FULL_SWEDISH_STOP_WORDS",
2595
+ get_rstopwords(FULL_SWEDISH_STOP_WORDS));
2596
+ rb_define_const(mAnalysis, "FULL_NORWEGIAN_STOP_WORDS",
2597
+ get_rstopwords(FULL_NORWEGIAN_STOP_WORDS));
2598
+ rb_define_const(mAnalysis, "FULL_DANISH_STOP_WORDS",
2599
+ get_rstopwords(FULL_DANISH_STOP_WORDS));
2600
+ rb_define_const(mAnalysis, "FULL_RUSSIAN_STOP_WORDS",
2601
+ get_rstopwords(FULL_RUSSIAN_STOP_WORDS));
2602
+ rb_define_const(mAnalysis, "FULL_FINNISH_STOP_WORDS",
2603
+ get_rstopwords(FULL_FINNISH_STOP_WORDS));
2604
+
2605
+ Init_Token();
2606
+ Init_TokenStream();
2607
+
2608
+ Init_AsciiLetterTokenizer();
2609
+ Init_LetterTokenizer();
2610
+
2611
+ Init_AsciiWhiteSpaceTokenizer();
2612
+ Init_WhiteSpaceTokenizer();
2613
+
2614
+ Init_AsciiStandardTokenizer();
2615
+ Init_StandardTokenizer();
2616
+
2617
+ Init_RegExpTokenizer();
2618
+
2619
+ Init_AsciiLowerCaseFilter();
2620
+ Init_LowerCaseFilter();
2621
+ Init_HyphenFilter();
2622
+ Init_StopFilter();
2623
+ Init_MappingFilter();
2624
+ Init_StemFilter();
2625
+
2626
+ Init_Analyzer();
2627
+ Init_AsciiLetterAnalyzer();
2628
+ Init_LetterAnalyzer();
2629
+ Init_AsciiWhiteSpaceAnalyzer();
2630
+ Init_WhiteSpaceAnalyzer();
2631
+ Init_AsciiStandardAnalyzer();
2632
+ Init_StandardAnalyzer();
2633
+ Init_PerFieldAnalyzer();
2634
+ Init_RegExpAnalyzer();
2635
+
2636
+ }