isomorfeus-ferret 0.12.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (222) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +612 -0
  3. data/README.md +44 -0
  4. data/ext/isomorfeus_ferret_ext/benchmark.c +223 -0
  5. data/ext/isomorfeus_ferret_ext/benchmark.h +45 -0
  6. data/ext/isomorfeus_ferret_ext/benchmarks_all.h +25 -0
  7. data/ext/isomorfeus_ferret_ext/bm_bitvector.c +123 -0
  8. data/ext/isomorfeus_ferret_ext/bm_hash.c +118 -0
  9. data/ext/isomorfeus_ferret_ext/bm_micro_string.c +40 -0
  10. data/ext/isomorfeus_ferret_ext/bm_store.c +93 -0
  11. data/ext/isomorfeus_ferret_ext/email.rl +21 -0
  12. data/ext/isomorfeus_ferret_ext/extconf.rb +5 -0
  13. data/ext/isomorfeus_ferret_ext/fio_tmpfile.h +53 -0
  14. data/ext/isomorfeus_ferret_ext/frb_analysis.c +2577 -0
  15. data/ext/isomorfeus_ferret_ext/frb_index.c +3457 -0
  16. data/ext/isomorfeus_ferret_ext/frb_lang.c +9 -0
  17. data/ext/isomorfeus_ferret_ext/frb_lang.h +17 -0
  18. data/ext/isomorfeus_ferret_ext/frb_qparser.c +629 -0
  19. data/ext/isomorfeus_ferret_ext/frb_search.c +4460 -0
  20. data/ext/isomorfeus_ferret_ext/frb_store.c +515 -0
  21. data/ext/isomorfeus_ferret_ext/frb_threading.h +30 -0
  22. data/ext/isomorfeus_ferret_ext/frb_utils.c +1127 -0
  23. data/ext/isomorfeus_ferret_ext/frt_analysis.c +1644 -0
  24. data/ext/isomorfeus_ferret_ext/frt_analysis.h +247 -0
  25. data/ext/isomorfeus_ferret_ext/frt_array.c +124 -0
  26. data/ext/isomorfeus_ferret_ext/frt_array.h +54 -0
  27. data/ext/isomorfeus_ferret_ext/frt_bitvector.c +95 -0
  28. data/ext/isomorfeus_ferret_ext/frt_bitvector.h +586 -0
  29. data/ext/isomorfeus_ferret_ext/frt_compound_io.c +374 -0
  30. data/ext/isomorfeus_ferret_ext/frt_config.h +44 -0
  31. data/ext/isomorfeus_ferret_ext/frt_document.c +134 -0
  32. data/ext/isomorfeus_ferret_ext/frt_document.h +52 -0
  33. data/ext/isomorfeus_ferret_ext/frt_except.c +95 -0
  34. data/ext/isomorfeus_ferret_ext/frt_except.h +188 -0
  35. data/ext/isomorfeus_ferret_ext/frt_field_index.c +233 -0
  36. data/ext/isomorfeus_ferret_ext/frt_field_index.h +42 -0
  37. data/ext/isomorfeus_ferret_ext/frt_filter.c +157 -0
  38. data/ext/isomorfeus_ferret_ext/frt_fs_store.c +502 -0
  39. data/ext/isomorfeus_ferret_ext/frt_global.c +427 -0
  40. data/ext/isomorfeus_ferret_ext/frt_global.h +290 -0
  41. data/ext/isomorfeus_ferret_ext/frt_hash.c +518 -0
  42. data/ext/isomorfeus_ferret_ext/frt_hash.h +466 -0
  43. data/ext/isomorfeus_ferret_ext/frt_hashset.c +191 -0
  44. data/ext/isomorfeus_ferret_ext/frt_hashset.h +206 -0
  45. data/ext/isomorfeus_ferret_ext/frt_helper.c +62 -0
  46. data/ext/isomorfeus_ferret_ext/frt_helper.h +13 -0
  47. data/ext/isomorfeus_ferret_ext/frt_ind.c +353 -0
  48. data/ext/isomorfeus_ferret_ext/frt_ind.h +54 -0
  49. data/ext/isomorfeus_ferret_ext/frt_index.c +6377 -0
  50. data/ext/isomorfeus_ferret_ext/frt_index.h +880 -0
  51. data/ext/isomorfeus_ferret_ext/frt_lang.c +104 -0
  52. data/ext/isomorfeus_ferret_ext/frt_lang.h +44 -0
  53. data/ext/isomorfeus_ferret_ext/frt_mempool.c +87 -0
  54. data/ext/isomorfeus_ferret_ext/frt_mempool.h +33 -0
  55. data/ext/isomorfeus_ferret_ext/frt_multimapper.c +349 -0
  56. data/ext/isomorfeus_ferret_ext/frt_multimapper.h +52 -0
  57. data/ext/isomorfeus_ferret_ext/frt_posh.c +1006 -0
  58. data/ext/isomorfeus_ferret_ext/frt_posh.h +973 -0
  59. data/ext/isomorfeus_ferret_ext/frt_priorityqueue.c +147 -0
  60. data/ext/isomorfeus_ferret_ext/frt_priorityqueue.h +147 -0
  61. data/ext/isomorfeus_ferret_ext/frt_q_boolean.c +1612 -0
  62. data/ext/isomorfeus_ferret_ext/frt_q_const_score.c +157 -0
  63. data/ext/isomorfeus_ferret_ext/frt_q_filtered_query.c +209 -0
  64. data/ext/isomorfeus_ferret_ext/frt_q_fuzzy.c +281 -0
  65. data/ext/isomorfeus_ferret_ext/frt_q_match_all.c +147 -0
  66. data/ext/isomorfeus_ferret_ext/frt_q_multi_term.c +672 -0
  67. data/ext/isomorfeus_ferret_ext/frt_q_parser.c +3084 -0
  68. data/ext/isomorfeus_ferret_ext/frt_q_phrase.c +1182 -0
  69. data/ext/isomorfeus_ferret_ext/frt_q_prefix.c +98 -0
  70. data/ext/isomorfeus_ferret_ext/frt_q_range.c +665 -0
  71. data/ext/isomorfeus_ferret_ext/frt_q_span.c +2386 -0
  72. data/ext/isomorfeus_ferret_ext/frt_q_term.c +311 -0
  73. data/ext/isomorfeus_ferret_ext/frt_q_wildcard.c +166 -0
  74. data/ext/isomorfeus_ferret_ext/frt_ram_store.c +460 -0
  75. data/ext/isomorfeus_ferret_ext/frt_scanner.c +899 -0
  76. data/ext/isomorfeus_ferret_ext/frt_scanner.h +28 -0
  77. data/ext/isomorfeus_ferret_ext/frt_scanner_mb.c +6705 -0
  78. data/ext/isomorfeus_ferret_ext/frt_scanner_utf8.c +4419 -0
  79. data/ext/isomorfeus_ferret_ext/frt_search.c +1824 -0
  80. data/ext/isomorfeus_ferret_ext/frt_search.h +924 -0
  81. data/ext/isomorfeus_ferret_ext/frt_similarity.c +150 -0
  82. data/ext/isomorfeus_ferret_ext/frt_similarity.h +79 -0
  83. data/ext/isomorfeus_ferret_ext/frt_sort.c +796 -0
  84. data/ext/isomorfeus_ferret_ext/frt_stopwords.c +395 -0
  85. data/ext/isomorfeus_ferret_ext/frt_store.c +680 -0
  86. data/ext/isomorfeus_ferret_ext/frt_store.h +789 -0
  87. data/ext/isomorfeus_ferret_ext/frt_term_vectors.c +72 -0
  88. data/ext/isomorfeus_ferret_ext/frt_threading.h +23 -0
  89. data/ext/isomorfeus_ferret_ext/frt_win32.h +54 -0
  90. data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.c +409 -0
  91. data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.h +95 -0
  92. data/ext/isomorfeus_ferret_ext/libstemmer.c +93 -0
  93. data/ext/isomorfeus_ferret_ext/libstemmer.h +73 -0
  94. data/ext/isomorfeus_ferret_ext/q_parser.y +1366 -0
  95. data/ext/isomorfeus_ferret_ext/scanner.h +28 -0
  96. data/ext/isomorfeus_ferret_ext/scanner.in +43 -0
  97. data/ext/isomorfeus_ferret_ext/scanner.rl +84 -0
  98. data/ext/isomorfeus_ferret_ext/scanner_mb.rl +200 -0
  99. data/ext/isomorfeus_ferret_ext/scanner_utf8.rl +85 -0
  100. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.c +324 -0
  101. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.h +7 -0
  102. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.c +610 -0
  103. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.h +6 -0
  104. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.c +1104 -0
  105. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.h +6 -0
  106. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.c +749 -0
  107. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.h +7 -0
  108. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.c +1233 -0
  109. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.h +6 -0
  110. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.c +490 -0
  111. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.h +6 -0
  112. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.c +1217 -0
  113. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.h +7 -0
  114. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.c +1052 -0
  115. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.h +6 -0
  116. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.c +283 -0
  117. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.h +6 -0
  118. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.c +735 -0
  119. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.h +6 -0
  120. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.c +1003 -0
  121. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.h +7 -0
  122. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.c +1079 -0
  123. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.h +6 -0
  124. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.c +293 -0
  125. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.h +6 -0
  126. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.c +984 -0
  127. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.h +6 -0
  128. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.c +686 -0
  129. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.h +6 -0
  130. data/ext/isomorfeus_ferret_ext/stem_UTF_8_danish.c +325 -0
  131. data/ext/isomorfeus_ferret_ext/stem_UTF_8_danish.h +6 -0
  132. data/ext/isomorfeus_ferret_ext/stem_UTF_8_dutch.c +620 -0
  133. data/ext/isomorfeus_ferret_ext/stem_UTF_8_dutch.h +6 -0
  134. data/ext/isomorfeus_ferret_ext/stem_UTF_8_english.c +1111 -0
  135. data/ext/isomorfeus_ferret_ext/stem_UTF_8_english.h +6 -0
  136. data/ext/isomorfeus_ferret_ext/stem_UTF_8_finnish.c +754 -0
  137. data/ext/isomorfeus_ferret_ext/stem_UTF_8_finnish.h +6 -0
  138. data/ext/isomorfeus_ferret_ext/stem_UTF_8_french.c +1242 -0
  139. data/ext/isomorfeus_ferret_ext/stem_UTF_8_french.h +6 -0
  140. data/ext/isomorfeus_ferret_ext/stem_UTF_8_german.c +495 -0
  141. data/ext/isomorfeus_ferret_ext/stem_UTF_8_german.h +6 -0
  142. data/ext/isomorfeus_ferret_ext/stem_UTF_8_hungarian.c +1220 -0
  143. data/ext/isomorfeus_ferret_ext/stem_UTF_8_hungarian.h +6 -0
  144. data/ext/isomorfeus_ferret_ext/stem_UTF_8_italian.c +1059 -0
  145. data/ext/isomorfeus_ferret_ext/stem_UTF_8_italian.h +6 -0
  146. data/ext/isomorfeus_ferret_ext/stem_UTF_8_norwegian.c +285 -0
  147. data/ext/isomorfeus_ferret_ext/stem_UTF_8_norwegian.h +6 -0
  148. data/ext/isomorfeus_ferret_ext/stem_UTF_8_porter.c +741 -0
  149. data/ext/isomorfeus_ferret_ext/stem_UTF_8_porter.h +6 -0
  150. data/ext/isomorfeus_ferret_ext/stem_UTF_8_portuguese.c +1009 -0
  151. data/ext/isomorfeus_ferret_ext/stem_UTF_8_portuguese.h +6 -0
  152. data/ext/isomorfeus_ferret_ext/stem_UTF_8_romanian.c +990 -0
  153. data/ext/isomorfeus_ferret_ext/stem_UTF_8_romanian.h +6 -0
  154. data/ext/isomorfeus_ferret_ext/stem_UTF_8_russian.c +680 -0
  155. data/ext/isomorfeus_ferret_ext/stem_UTF_8_russian.h +6 -0
  156. data/ext/isomorfeus_ferret_ext/stem_UTF_8_spanish.c +1083 -0
  157. data/ext/isomorfeus_ferret_ext/stem_UTF_8_spanish.h +6 -0
  158. data/ext/isomorfeus_ferret_ext/stem_UTF_8_swedish.c +294 -0
  159. data/ext/isomorfeus_ferret_ext/stem_UTF_8_swedish.h +6 -0
  160. data/ext/isomorfeus_ferret_ext/stem_UTF_8_turkish.c +2191 -0
  161. data/ext/isomorfeus_ferret_ext/stem_UTF_8_turkish.h +6 -0
  162. data/ext/isomorfeus_ferret_ext/stem_api.c +66 -0
  163. data/ext/isomorfeus_ferret_ext/stem_api.h +26 -0
  164. data/ext/isomorfeus_ferret_ext/stem_header.h +57 -0
  165. data/ext/isomorfeus_ferret_ext/stem_modules.h +190 -0
  166. data/ext/isomorfeus_ferret_ext/stem_modules.txt +50 -0
  167. data/ext/isomorfeus_ferret_ext/stem_utilities.c +478 -0
  168. data/ext/isomorfeus_ferret_ext/test.c +850 -0
  169. data/ext/isomorfeus_ferret_ext/test.h +416 -0
  170. data/ext/isomorfeus_ferret_ext/test_1710.c +63 -0
  171. data/ext/isomorfeus_ferret_ext/test_analysis.c +1221 -0
  172. data/ext/isomorfeus_ferret_ext/test_array.c +272 -0
  173. data/ext/isomorfeus_ferret_ext/test_bitvector.c +600 -0
  174. data/ext/isomorfeus_ferret_ext/test_compound_io.c +170 -0
  175. data/ext/isomorfeus_ferret_ext/test_document.c +156 -0
  176. data/ext/isomorfeus_ferret_ext/test_except.c +244 -0
  177. data/ext/isomorfeus_ferret_ext/test_fields.c +522 -0
  178. data/ext/isomorfeus_ferret_ext/test_file_deleter.c +185 -0
  179. data/ext/isomorfeus_ferret_ext/test_filter.c +331 -0
  180. data/ext/isomorfeus_ferret_ext/test_fs_store.c +25 -0
  181. data/ext/isomorfeus_ferret_ext/test_global.c +299 -0
  182. data/ext/isomorfeus_ferret_ext/test_hash.c +485 -0
  183. data/ext/isomorfeus_ferret_ext/test_hashset.c +288 -0
  184. data/ext/isomorfeus_ferret_ext/test_helper.c +47 -0
  185. data/ext/isomorfeus_ferret_ext/test_highlighter.c +548 -0
  186. data/ext/isomorfeus_ferret_ext/test_index.c +2323 -0
  187. data/ext/isomorfeus_ferret_ext/test_lang.c +74 -0
  188. data/ext/isomorfeus_ferret_ext/test_mempool.c +102 -0
  189. data/ext/isomorfeus_ferret_ext/test_multimapper.c +64 -0
  190. data/ext/isomorfeus_ferret_ext/test_priorityqueue.c +213 -0
  191. data/ext/isomorfeus_ferret_ext/test_q_const_score.c +84 -0
  192. data/ext/isomorfeus_ferret_ext/test_q_filtered.c +61 -0
  193. data/ext/isomorfeus_ferret_ext/test_q_fuzzy.c +241 -0
  194. data/ext/isomorfeus_ferret_ext/test_q_parser.c +464 -0
  195. data/ext/isomorfeus_ferret_ext/test_q_span.c +575 -0
  196. data/ext/isomorfeus_ferret_ext/test_ram_store.c +77 -0
  197. data/ext/isomorfeus_ferret_ext/test_search.c +1874 -0
  198. data/ext/isomorfeus_ferret_ext/test_segments.c +167 -0
  199. data/ext/isomorfeus_ferret_ext/test_similarity.c +25 -0
  200. data/ext/isomorfeus_ferret_ext/test_sort.c +333 -0
  201. data/ext/isomorfeus_ferret_ext/test_store.c +591 -0
  202. data/ext/isomorfeus_ferret_ext/test_store.h +3 -0
  203. data/ext/isomorfeus_ferret_ext/test_term.c +351 -0
  204. data/ext/isomorfeus_ferret_ext/test_term_vectors.c +373 -0
  205. data/ext/isomorfeus_ferret_ext/test_test.c +83 -0
  206. data/ext/isomorfeus_ferret_ext/test_threading.c +188 -0
  207. data/ext/isomorfeus_ferret_ext/testhelper.c +561 -0
  208. data/ext/isomorfeus_ferret_ext/testhelper.h +25 -0
  209. data/ext/isomorfeus_ferret_ext/tests_all.h +87 -0
  210. data/ext/isomorfeus_ferret_ext/uchar-ucs4.rl +1854 -0
  211. data/ext/isomorfeus_ferret_ext/uchar-utf8.rl +1999 -0
  212. data/ext/isomorfeus_ferret_ext/url.rl +27 -0
  213. data/ext/isomorfeus_ferret_ext/word_list.h +15156 -0
  214. data/lib/isomorfeus/ferret/document.rb +132 -0
  215. data/lib/isomorfeus/ferret/field_symbol.rb +85 -0
  216. data/lib/isomorfeus/ferret/index/field_infos.rb +48 -0
  217. data/lib/isomorfeus/ferret/index/index.rb +970 -0
  218. data/lib/isomorfeus/ferret/monitor.rb +323 -0
  219. data/lib/isomorfeus/ferret/stdlib_patches.rb +151 -0
  220. data/lib/isomorfeus/ferret/version.rb +5 -0
  221. data/lib/isomorfeus-ferret.rb +8 -0
  222. metadata +307 -0
@@ -0,0 +1,2577 @@
1
+ #include <locale.h>
2
+ #include "frt_analysis.h"
3
+ #include "isomorfeus_ferret.h"
4
+ #include <ruby/re.h>
5
+ #include <ruby/st.h>
6
+
7
+ static char *frb_locale = NULL;
8
+
9
+ static VALUE mAnalysis;
10
+
11
+ static VALUE cToken;
12
+ static VALUE cAsciiLetterTokenizer;
13
+ static VALUE cLetterTokenizer;
14
+ static VALUE cAsciiWhiteSpaceTokenizer;
15
+ static VALUE cWhiteSpaceTokenizer;
16
+ static VALUE cAsciiStandardTokenizer;
17
+ static VALUE cStandardTokenizer;
18
+ static VALUE cRegExpTokenizer;
19
+
20
+ static VALUE cAsciiLowerCaseFilter;
21
+ static VALUE cLowerCaseFilter;
22
+ static VALUE cStopFilter;
23
+ static VALUE cMappingFilter;
24
+ static VALUE cHyphenFilter;
25
+ static VALUE cStemFilter;
26
+
27
+ static VALUE cAnalyzer;
28
+ static VALUE cAsciiLetterAnalyzer;
29
+ static VALUE cLetterAnalyzer;
30
+ static VALUE cAsciiWhiteSpaceAnalyzer;
31
+ static VALUE cWhiteSpaceAnalyzer;
32
+ static VALUE cAsciiStandardAnalyzer;
33
+ static VALUE cStandardAnalyzer;
34
+ static VALUE cPerFieldAnalyzer;
35
+ static VALUE cRegExpAnalyzer;
36
+
37
+ static VALUE cTokenStream;
38
+
39
+ /* TokenStream Methods */
40
+ static ID id_next;
41
+ static ID id_reset;
42
+ static ID id_clone;
43
+ static ID id_text;
44
+
45
+ /* FrtAnalyzer Methods */
46
+ static ID id_token_stream;
47
+
48
+ static VALUE object_space;
49
+
50
/* Declared in Ruby's internal regex engine; not exposed via ruby/re.h. */
extern int ruby_re_search(struct re_pattern_buffer *, const char *, int, int,
                          int, struct re_registers *);

/*
 * Return the number of entries in a Ruby Hash.
 *
 * Uses the RHASH_SIZE macro when the Ruby headers provide it; otherwise
 * falls back to reading the hash's st_table directly (older Rubies).
 * NOTE(review): the count is truncated to int here — fine for stop-word
 * and option hashes, but would overflow for > INT_MAX entries.
 */
int
frb_rb_hash_size(VALUE hash)
{
#ifdef RHASH_SIZE
    return RHASH_SIZE(hash);
#else
    return RHASH(hash)->ntbl->num_entries;
#endif
}
62
+
63
+ /****************************************************************************
64
+ *
65
+ * Utility Methods
66
+ *
67
+ ****************************************************************************/
68
+
69
/*
 * Convert a Ruby Array of stop words into a NULL-terminated C array of
 * char pointers.
 *
 * rstop_words:: Ruby Array; each element is coerced with rb_obj_as_string.
 * return::      newly FRT_ALLOC'd, NULL-terminated char** — the caller owns
 *               (and must free) the array itself.
 *
 * NOTE(review): each entry points directly into the Ruby string's buffer
 * (rs2s), so entries are only valid while those Ruby strings stay alive
 * and unmodified — callers must copy or keep the strings reachable.
 */
static char **
get_stopwords(VALUE rstop_words)
{
    char **stop_words;
    int i, len;
    VALUE rstr;
    Check_Type(rstop_words, T_ARRAY);
    len = RARRAY_LEN(rstop_words);
    stop_words = FRT_ALLOC_N(char *, RARRAY_LEN(rstop_words) + 1);
    stop_words[len] = NULL;
    for (i = 0; i < len; i++) {
        rstr = rb_obj_as_string(RARRAY_PTR(rstop_words)[i]);
        stop_words[i] = rs2s(rstr);
    }
    return stop_words;
}
85
+
86
+ /****************************************************************************
87
+ *
88
+ * token methods
89
+ *
90
+ ****************************************************************************/
91
+
92
/* Ruby-side representation of an analysis token. */
typedef struct RToken {
    VALUE text;   /* Ruby String holding the token text */
    int start;    /* start offset in bytes */
    int end;      /* end offset in bytes */
    int pos_inc;  /* position increment relative to the previous token */
} RToken;

/* GC free callback: the RToken struct itself is the only owned memory;
 * token->text is a GC-managed Ruby string. */
static void
frb_token_free(void *p)
{
    free(p);
}

/* GC mark callback: keep the token's text string alive. */
static void
frb_token_mark(void *p)
{
    RToken *token = (RToken *)p;
    rb_gc_mark(token->text);
}

/* Allocator for the Token class: wraps an uninitialized RToken. */
static VALUE
frb_token_alloc(VALUE klass)
{
    return Data_Wrap_Struct(klass, &frb_token_mark, &frb_token_free,
                            ALLOC(RToken));
}

/* Build a Ruby Token object from a C-level FrtToken, copying its text
 * and offsets. */
static VALUE
get_token(FrtToken *tk)
{
    RToken *token = ALLOC(RToken);

    token->text = rb_str_new2(tk->text);
    token->start = tk->start;
    token->end = tk->end;
    token->pos_inc = tk->pos_inc;
    return Data_Wrap_Struct(cToken, &frb_token_mark, &frb_token_free, token);
}

/*
 * Copy a Ruby Token's fields into the C-level FrtToken +tk+.
 * Returns NULL when +rt+ is nil (end of stream), otherwise +tk+.
 */
FrtToken *
frb_set_token(FrtToken *tk, VALUE rt)
{
    RToken *rtk;

    if (rt == Qnil) return NULL;

    Data_Get_Struct(rt, RToken, rtk);
    frt_tk_set(tk, rs2s(rtk->text), RSTRING_LEN(rtk->text),
               rtk->start, rtk->end, rtk->pos_inc);
    return tk;
}

/* Fetch the RToken wrapped by a Ruby Token object. */
#define GET_TK(tk, self) Data_Get_Struct(self, RToken, tk)
145
+
146
+ /*
147
+ * call-seq:
148
+ * Token.new(text, start, end, pos_inc = 1) -> new Token
149
+ *
150
+ * Creates a new token setting the text, start and end offsets of the token
151
+ * and the position increment for the token.
152
+ *
153
+ * The position increment is usually set to 1 but you can set it to other
154
+ * values as needed. For example, if you have a stop word filter you will be
155
+ * skipping tokens. Let's say you have the stop words "the" and "and" and you
156
+ * parse the title "The Old Man and the Sea". The terms "Old", "Man" and
157
+ * "Sea" will have the position increments 2, 1 and 3 respectively.
158
+ *
159
+ * Another reason you might want to vary the position increment is if you are
160
+ * adding synonyms to the index. For example let's say you have the synonym
161
+ * group "quick", "fast" and "speedy". When tokenizing the phrase "Next day
162
+ * speedy delivery", you'll add "speedy" first with a position increment of 1
163
+ * and then "fast" and "quick" with position increments of 0 since they are
164
+ * represented in the same position.
165
+ *
166
+ * The offset set values +start+ and +end+ should be byte offsets, not
167
+ * character offsets. This makes it easy to use those offsets to quickly
168
+ * access the token in the input string and also to insert highlighting tags
169
+ * when necessary.
170
+ *
171
+ * text:: the main text for the token.
172
+ * start:: the start offset of the token in bytes.
173
+ * end:: the end offset of the token in bytes.
174
+ * pos_inc:: the position increment of a token. See above.
175
+ * return:: a newly created and assigned Token object
176
+ */
177
+ static VALUE
178
+ frb_token_init(int argc, VALUE *argv, VALUE self)
179
+ {
180
+ RToken *token;
181
+ VALUE rtext, rstart, rend, rpos_inc, rtype;
182
+ GET_TK(token, self);
183
+ token->pos_inc = 1;
184
+ switch (rb_scan_args(argc, argv, "32", &rtext, &rstart,
185
+ &rend, &rpos_inc, &rtype)) {
186
+ case 5: /* type gets ignored at this stage */
187
+ case 4: token->pos_inc = FIX2INT(rpos_inc);
188
+ }
189
+ token->text = rb_obj_as_string(rtext);
190
+ token->start = FIX2INT(rstart);
191
+ token->end = FIX2INT(rend);
192
+ return self;
193
+ }
194
+
195
+ /*
196
+ * call-seq:
197
+ * token.cmp(other_token) -> bool
198
+ *
199
+ * Used to compare two tokens. Token is extended by Comparable so you can
200
+ * also use +<+, +>+, +<=+, +>=+ etc. to compare tokens.
201
+ *
202
+ * Tokens are sorted by the position in the text at which they occur, ie
203
+ * the start offset. If two tokens have the same start offset, (see
204
+ * pos_inc=) then, they are sorted by the end offset and then
205
+ * lexically by the token text.
206
+ */
207
+ static VALUE
208
+ frb_token_cmp(VALUE self, VALUE rother)
209
+ {
210
+ RToken *token, *other;
211
+ int cmp;
212
+ GET_TK(token, self);
213
+ GET_TK(other, rother);
214
+ if (token->start > other->start) {
215
+ cmp = 1;
216
+ } else if (token->start < other->start) {
217
+ cmp = -1;
218
+ } else {
219
+ if (token->end > other->end) {
220
+ cmp = 1;
221
+ } else if (token->end < other->end) {
222
+ cmp = -1;
223
+ } else {
224
+ cmp = strcmp(rs2s(token->text), rs2s(other->text));
225
+ }
226
+ }
227
+ return INT2FIX(cmp);
228
+ }
229
+
230
+ /*
231
+ * call-seq:
232
+ * token.text -> text
233
+ *
234
+ * Returns the text that this token represents
235
+ */
236
+ static VALUE
237
+ frb_token_get_text(VALUE self)
238
+ {
239
+ RToken *token;
240
+ GET_TK(token, self);
241
+ return token->text;
242
+ }
243
+
244
+ /*
245
+ * call-seq:
246
+ * token.text = text -> text
247
+ *
248
+ * Set the text for this token.
249
+ */
250
+ static VALUE
251
+ frb_token_set_text(VALUE self, VALUE rtext)
252
+ {
253
+ RToken *token;
254
+ GET_TK(token, self);
255
+ token->text = rtext;
256
+ return rtext;
257
+ }
258
+
259
+ /*
260
+ * call-seq:
261
+ * token.start -> integer
262
+ *
263
+ * Start byte-position of this token
264
+ */
265
+ static VALUE
266
+ frb_token_get_start_offset(VALUE self)
267
+ {
268
+ RToken *token;
269
+ GET_TK(token, self);
270
+ return INT2FIX(token->start);
271
+ }
272
+
273
+ /*
274
+ * call-seq:
275
+ * token.end -> integer
276
+ *
277
+ * End byte-position of this token
278
+ */
279
+ static VALUE
280
+ frb_token_get_end_offset(VALUE self)
281
+ {
282
+ RToken *token;
283
+ GET_TK(token, self);
284
+ return INT2FIX(token->end);
285
+ }
286
+
287
+ /*
288
+ * call-seq:
289
+ * token.pos_inc -> integer
290
+ *
291
+ * Position Increment for this token
292
+ */
293
+ static VALUE
294
+ frb_token_get_pos_inc(VALUE self)
295
+ {
296
+ RToken *token;
297
+ GET_TK(token, self);
298
+ return INT2FIX(token->pos_inc);
299
+ }
300
+
301
+ /*
302
+ * call-seq:
303
+ * token.start = start -> integer
304
+ *
305
+ * Set start byte-position of this token
306
+ */
307
+ static VALUE
308
+ frb_token_set_start_offset(VALUE self, VALUE rstart)
309
+ {
310
+ RToken *token;
311
+ GET_TK(token, self);
312
+ token->start = FIX2INT(rstart);
313
+ return rstart;
314
+ }
315
+
316
+ /*
317
+ * call-seq:
318
+ * token.end = end -> integer
319
+ *
320
+ * Set end byte-position of this token
321
+ */
322
+ static VALUE
323
+ frb_token_set_end_offset(VALUE self, VALUE rend)
324
+ {
325
+ RToken *token;
326
+ GET_TK(token, self);
327
+ token->end = FIX2INT(rend);
328
+ return rend;
329
+ }
330
+
331
+ /*
332
+ * call-seq:
333
+ * token.pos_inc = pos_inc -> integer
334
+ *
335
+ * Set the position increment. This determines the position of this token
336
+ * relative to the previous Token in a TokenStream, used in phrase
337
+ * searching.
338
+ *
339
+ * The default value is 1.
340
+ *
341
+ * Some common uses for this are:
342
+ *
343
+ * * Set it to zero to put multiple terms in the same position. This is
344
+ * useful if, e.g., a word has multiple stems. Searches for phrases
345
+ * including either stem will match. In this case, all but the first
346
+ * stem's increment should be set to zero: the increment of the first
347
+ * instance should be one. Repeating a token with an increment of zero
348
+ * can also be used to boost the scores of matches on that token.
349
+ *
350
+ * * Set it to values greater than one to inhibit exact phrase matches.
351
+ * If, for example, one does not want phrases to match across removed
352
+ * stop words, then one could build a stop word filter that removes stop
353
+ * words and also sets the increment to the number of stop words removed
354
+ * before each non-stop word. Then exact phrase queries will only match
355
+ * when the terms occur with no intervening stop words.
356
+ *
357
+ */
358
+ static VALUE
359
+ frb_token_set_pos_inc(VALUE self, VALUE rpos_inc)
360
+ {
361
+ RToken *token;
362
+ GET_TK(token, self);
363
+ token->pos_inc = FIX2INT(rpos_inc);
364
+ return rpos_inc;
365
+ }
366
+
367
+ /*
368
+ * call-seq:
369
+ * token.to_s -> token_str
370
+ *
371
+ * Return a string representation of the token
372
+ */
373
+ static VALUE
374
+ frb_token_to_s(VALUE self)
375
+ {
376
+ RToken *token;
377
+ char *buf;
378
+ GET_TK(token, self);
379
+ buf = alloca(RSTRING_LEN(token->text) + 80);
380
+ sprintf(buf, "token[\"%s\":%d:%d:%d]", rs2s(token->text),
381
+ token->start, token->end, token->pos_inc);
382
+ return rb_str_new2(buf);
383
+ }
384
+
385
+ /****************************************************************************
386
+ *
387
+ * TokenStream Methods
388
+ *
389
+ ****************************************************************************/
390
+
391
/* Fetch the FrtTokenStream wrapped by a Ruby TokenStream object. */
#define GET_TS(ts, self) Data_Get_Struct(self, FrtTokenStream, ts)

/* GC mark callback: keep the Ruby string registered under &ts->text
 * alive (frb_gc_mark presumably looks the key up in the object
 * registry used by object_add/object_get — verify in frb_utils.c). */
static void
frb_ts_mark(void *p)
{
    FrtTokenStream *ts = (FrtTokenStream *)p;
    if (ts->text) frb_gc_mark(&ts->text);
}

/* GC free callback: drop the registry entries for the stream's text and
 * for the stream itself, then release the C-side reference
 * (frt_ts_deref presumably frees the stream once its ref count hits
 * zero — see frt_analysis.c). */
static void
frb_ts_free(FrtTokenStream *ts)
{
    if (object_get(&ts->text) != Qnil) {
        object_del(&ts->text);
    }
    object_del(ts);
    frt_ts_deref(ts);
}

/* RegExpTokenStream callbacks, defined later in this file. */
static void frb_rets_free(FrtTokenStream *ts);
static void frb_rets_mark(FrtTokenStream *ts);
static FrtToken *rets_next(FrtTokenStream *ts);
413
+
414
/*
 * Return the Ruby TokenStream wrapping +ts+, creating and registering
 * it on first use. Streams whose next function is rets_next (RegExp
 * tokenizers) get the rets mark/free callbacks; all others get the
 * generic ones.
 */
static VALUE
get_rb_token_stream(FrtTokenStream *ts)
{
    VALUE rts = object_get(ts);
    if (rts == Qnil) {
        if (ts->next == &rets_next) {
            rts = Data_Wrap_Struct(cTokenStream, &frb_rets_mark,
                                   &frb_rets_free, ts);
        } else {
            rts = Data_Wrap_Struct(cTokenStream, &frb_ts_mark,
                                   &frb_ts_free, ts);
        }
        object_add(ts, rts);
    }
    return rts;
}

/*
 * Wrap the C token stream +ts+ in the already-allocated Ruby object
 * +self+, reset it to tokenize +rstr+, and register both the text and
 * the stream in the object registry so neither is garbage collected
 * while the stream lives.
 */
static VALUE
get_wrapped_ts(VALUE self, VALUE rstr, FrtTokenStream *ts)
{
    StringValue(rstr);
    ts->reset(ts, rs2s(rstr));
    Frt_Wrap_Struct(self, &frb_ts_mark, &frb_ts_free, ts);
    object_add(&ts->text, rstr);
    object_add(ts, self);
    return self;
}
441
+
442
/*
 * call-seq:
 *    token_stream.text = text -> text
 *
 * Set the text attribute of the TokenStream to the text you wish to be
 * tokenized. For example, you may do this;
 *
 *    token_stream.text = File.read(file_name)
 */
static VALUE
frb_ts_set_text(VALUE self, VALUE rtext)
{
    FrtTokenStream *ts;
    Data_Get_Struct(self, FrtTokenStream, ts);
    StringValue(rtext);
    ts->reset(ts, rs2s(rtext));

    /* prevent garbage collection */
    rb_ivar_set(self, id_text, rtext);

    return rtext;
}

/*
 * call-seq:
 *    token_stream.text -> text
 *
 * Return the text that the TokenStream is tokenizing
 */
static VALUE
frb_ts_get_text(VALUE self)
{
    VALUE rtext = Qnil;
    FrtTokenStream *ts;
    Data_Get_Struct(self, FrtTokenStream, ts);
    /* Prefer the Ruby string already registered for this stream's text;
     * otherwise build a fresh Ruby string from the C buffer and register
     * it for next time. */
    if ((rtext = object_get(&ts->text)) == Qnil) {
        if (ts->text) {
            rtext = rb_str_new2(ts->text);
            object_set(&ts->text, rtext);
        }
    }
    return rtext;
}
485
+
486
+ /*
487
+ * call-seq:
488
+ * token_stream.next -> token
489
+ *
490
+ * Return the next token from the TokenStream or nil if there are no more
491
+ * tokens.
492
+ */
493
+ static VALUE
494
+ frb_ts_next(VALUE self)
495
+ {
496
+ FrtTokenStream *ts;
497
+ FrtToken *next;
498
+ GET_TS(ts, self);
499
+ next = ts->next(ts);
500
+ if (next == NULL) {
501
+ return Qnil;
502
+ }
503
+
504
+ return get_token(next);
505
+ }
506
+
507
+ /****************************************************************************
508
+ * TokenFilter
509
+ ****************************************************************************/
510
+
511
/* View a token stream as a token filter (filters wrap a sub-stream). */
#define TkFilt(filter) ((FrtTokenFilter *)(filter))

/* GC mark callback for token filters: keep the wrapped sub-stream's
 * registered Ruby object alive. */
static void
frb_tf_mark(void *p)
{
    FrtTokenStream *ts = (FrtTokenStream *)p;
    if (TkFilt(ts)->sub_ts) {
        frb_gc_mark(&TkFilt(ts)->sub_ts);
    }
}

/* GC free callback for token filters: unregister the sub-stream (if it
 * was registered) and the filter itself, then drop the C-side
 * reference. */
static void
frb_tf_free(FrtTokenStream *ts)
{
    if (TkFilt(ts)->sub_ts && (object_get(&TkFilt(ts)->sub_ts) != Qnil)) {
        object_del(&TkFilt(ts)->sub_ts);
    }
    object_del(ts);
    frt_ts_deref(ts);
}
531
+
532
+
533
+ /****************************************************************************
534
+ * CWrappedTokenStream
535
+ ****************************************************************************/
536
+
537
#define CachedTS(token_stream) ((FrtCachedTokenStream *)(token_stream))
#define CWTS(token_stream) ((CWrappedTokenStream *)(token_stream))

/* C token stream that delegates next/reset/clone to a Ruby TokenStream. */
typedef struct CWrappedTokenStream {
    FrtCachedTokenStream super;
    VALUE rts;  /* the wrapped Ruby token stream object */
} CWrappedTokenStream;

/* destroy_i callback: unregister the stream's text, remove the
 * GC-protection entry for the wrapped Ruby stream, then free.
 * NOTE(review): the |1 tag presumably keeps the pointer-derived key from
 * looking like a GC-reachable object reference — verify against the
 * other object_space users in this extension. */
static void
cwrts_destroy_i(FrtTokenStream *ts)
{
    if (object_get(&ts->text) != Qnil) {
        object_del(&ts->text);
    }
    rb_hash_delete(object_space, ((VALUE)ts)|1);
    free(ts);
}

/* next callback: pull the next token from the Ruby stream and copy it
 * into the cached C token; NULL when the Ruby stream returns nil. */
static FrtToken *
cwrts_next(FrtTokenStream *ts)
{
    VALUE rtoken = rb_funcall(CWTS(ts)->rts, id_next, 0);
    return frb_set_token(&(CachedTS(ts)->token), rtoken);
}

/* reset callback: point the C stream at +text+ and reset the Ruby
 * stream with a new Ruby string of the same text. */
static FrtTokenStream *
cwrts_reset(FrtTokenStream *ts, char *text)
{
    ts->t = ts->text = text;
    rb_funcall(CWTS(ts)->rts, id_reset, 1, rb_str_new2(text));
    return ts;
}

/* clone_i callback: clone the C struct, clone the Ruby stream, and
 * GC-protect the clone under the new C pointer's key. */
static FrtTokenStream *
cwrts_clone_i(FrtTokenStream *orig_ts)
{
    FrtTokenStream *new_ts = frt_ts_clone_size(orig_ts, sizeof(CWrappedTokenStream));
    VALUE rts = CWTS(new_ts)->rts = rb_funcall(CWTS(orig_ts)->rts, id_clone, 0);
    rb_hash_aset(object_space, ((VALUE)new_ts)|1, rts);
    return new_ts;
}

/*
 * Return a C-level FrtTokenStream for the Ruby object +rts+. If +rts+
 * already wraps a C stream (frb_is_cclass), reuse it with an extra
 * reference; otherwise build a CWrappedTokenStream whose callbacks call
 * back into Ruby, and pin +rts+ in object_space so it survives GC.
 */
static FrtTokenStream *
frb_get_cwrapped_rts(VALUE rts)
{
    FrtTokenStream *ts;
    if (frb_is_cclass(rts) && DATA_PTR(rts)) {
        GET_TS(ts, rts);
        FRT_REF(ts);
    }
    else {
        ts = frt_ts_new(CWrappedTokenStream);
        CWTS(ts)->rts = rts;
        ts->next = &cwrts_next;
        ts->reset = &cwrts_reset;
        ts->clone_i = &cwrts_clone_i;
        ts->destroy_i = &cwrts_destroy_i;
        /* prevent from being garbage collected */
        rb_hash_aset(object_space, ((VALUE)ts)|1, rts);
        ts->ref_cnt = 1;
    }
    return ts;
}
600
+
601
+ /****************************************************************************
602
+ * RegExpTokenStream
603
+ ****************************************************************************/
604
+
605
/* Regexp fragments used to build the default token pattern. */
#define P "[_\\/.,-]"            /* punctuation that may join compound parts */
#define HASDIGIT "\\w*\\d\\w*"   /* word containing at least one digit */
#define ALPHA "[-_[:alpha:]]"
#define ALNUM "[-_[:alnum:]]"

/* View a token stream as a RegExpTokenStream. */
#define RETS(token_stream) ((RegExpTokenStream *)(token_stream))

/* Default token pattern: words with apostrophes or acronym dots,
 * email-like and URL-like sequences, and digit-bearing compounds. */
static const char *TOKEN_RE =
    ALPHA "+(('" ALPHA "+)+|\\.(" ALPHA "\\.)+|"
    "(@|\\&)\\w+([-.]\\w+)*|:\\/\\/" ALNUM "+([-.\\/]" ALNUM "+)*)?"
    "|\\w+(([-._]\\w+)*\\@\\w+([-.]\\w+)+"
    "|" P HASDIGIT "(" P "\\w+" P HASDIGIT ")*(" P "\\w+)?"
    "|(\\.\\w+)+"
    "|"
    ")";
static VALUE rtoken_re;  /* Ruby Regexp compiled from TOKEN_RE */

/* Token stream that scans its text with a Ruby Regexp; each match may
 * be post-processed by a Ruby proc (proc presumably Qnil when unset —
 * verify at the construction site). */
typedef struct RegExpTokenStream {
    FrtCachedTokenStream super;
    VALUE rtext;    /* Ruby String being tokenized */
    VALUE regex;    /* Ruby Regexp used for scanning */
    VALUE proc;     /* optional per-match proc */
    long curr_ind;  /* current byte offset into rtext */
} RegExpTokenStream;
629
+
630
+ static void
631
+ rets_destroy_i(FrtTokenStream *ts)
632
+ {
633
+ if (object_get(&ts->text) != Qnil) {
634
+ object_del(&ts->text);
635
+ }
636
+ rb_hash_delete(object_space, ((VALUE)ts)|1);
637
+ free(ts);
638
+ }
639
+
640
/* Free function for the Ruby wrapper object: remove the object-map
 * entries, then release one reference; the stream itself is reclaimed
 * via its destroy_i when the reference count reaches zero. */
static void
frb_rets_free(FrtTokenStream *ts)
{
    if (object_get(&ts->text) != Qnil) {
        object_del(&ts->text);
    }
    object_del(ts);
    frt_ts_deref(ts);
}
649
+
650
/* GC mark function: keep the tokenized text, the regexp and the
 * optional proc alive while the stream is reachable. */
static void
frb_rets_mark(FrtTokenStream *ts)
{
    if (ts->text) frb_gc_mark(&ts->text);
    rb_gc_mark(RETS(ts)->rtext);
    rb_gc_mark(RETS(ts)->regex);
    rb_gc_mark(RETS(ts)->proc);
}
658
+
659
+ /*
660
+ * call-seq:
661
+ * tokenizer.text = text -> text
662
+ *
663
+ * Set the text to be tokenized by the tokenizer. The tokenizer gets reset to
664
+ * tokenize the text from the beginning.
665
+ */
666
+ static VALUE
667
+ frb_rets_set_text(VALUE self, VALUE rtext)
668
+ {
669
+ FrtTokenStream *ts;
670
+ GET_TS(ts, self);
671
+
672
+ rb_hash_aset(object_space, ((VALUE)ts)|1, rtext);
673
+ StringValue(rtext);
674
+ RETS(ts)->rtext = rtext;
675
+ RETS(ts)->curr_ind = 0;
676
+
677
+ return rtext;
678
+ }
679
+
680
/*
 * call-seq:
 *    tokenizer.text -> text
 *
 * Get the text being tokenized by the tokenizer.
 */
static VALUE
frb_rets_get_text(VALUE self)
{
    FrtTokenStream *ts;
    GET_TS(ts, self);
    return RETS(ts)->rtext;
}
693
+
694
// partly lifted from ruby 1.9 string.c
#include <ruby/encoding.h>
#define BEG(no) regs->beg[no]
#define END(no) regs->end[no]
#define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
/* Scan +str+ for the next match of +pat+ starting at offset *start.
 * On a match, advance *start past the match — by at least one
 * character for an empty match, to guarantee forward progress — and
 * return the matched substring. Returns Qnil when nothing matches. */
static VALUE
scan_once(VALUE str, VALUE pat, long *start)
{
    VALUE match;
    struct re_registers *regs;

    if (rb_reg_search(pat, str, *start, 0) >= 0) {
        match = rb_backref_get();
        regs = RMATCH_REGS(match);
        if (BEG(0) == END(0)) {
            rb_encoding *enc = STR_ENC_GET(str);
            /*
             * Always consume at least one character of the input string
             */
            if (RSTRING_LEN(str) > END(0))
                *start = END(0)+rb_enc_mbclen(RSTRING_PTR(str)+END(0),
                                              RSTRING_END(str), enc);
            else
                *start = END(0)+1;
        }
        else {
            *start = END(0);
        }
        return rb_reg_nth_match(0, match);
    }
    return Qnil;
}
//
726
+ //
727
+
728
/* next() callback: return the next token matched by the regexp, or
 * NULL at end of input. When a proc was supplied, the matched text is
 * transformed through it first; the start/end offsets always refer to
 * the raw match within rtext. */
static FrtToken *
rets_next(FrtTokenStream *ts)
{
    VALUE ret;
    long rtok_len;
    int beg, end;
    Check_Type(RETS(ts)->regex, T_REGEXP);
    ret = scan_once(RETS(ts)->rtext, RETS(ts)->regex, &(RETS(ts)->curr_ind));
    if (NIL_P(ret)) return NULL;

    Check_Type(ret, T_STRING);
    rtok_len = RSTRING_LEN(ret);
    /* curr_ind was advanced past the match by scan_once */
    beg = RETS(ts)->curr_ind - rtok_len;
    end = RETS(ts)->curr_ind;

    if (NIL_P(RETS(ts)->proc)) {
        return frt_tk_set(&(CachedTS(ts)->token), rs2s(ret), rtok_len,
                          beg, end, 1);
    } else {
        VALUE rtok;
        rtok = rb_funcall(RETS(ts)->proc, id_call, 1, ret);
        return frt_tk_set(&(CachedTS(ts)->token), rs2s(rtok),
                          RSTRING_LEN(rtok), beg, end, 1);
    }
}
753
+
754
/* reset() callback: point the stream at a fresh String built from
 * +text+ and rewind to the beginning.
 * NOTE(review): unlike frb_rets_set_text, the new String is not pinned
 * in object_space here; it is only protected while reachable through
 * frb_rets_mark — confirm this is intended. */
static FrtTokenStream *
rets_reset(FrtTokenStream *ts, char *text)
{
    RETS(ts)->rtext = rb_str_new2(text);
    RETS(ts)->curr_ind = 0;
    return ts;
}
761
+
762
+ static FrtTokenStream *
763
+ rets_clone_i(FrtTokenStream *orig_ts)
764
+ {
765
+ FrtTokenStream *ts = frt_ts_clone_size(orig_ts, sizeof(RegExpTokenStream));
766
+ return ts;
767
+ }
768
+
769
/* Construct a RegExpTokenStream over +rtext+ (may be Qnil) using
 * +regex+ (falls back to the compiled default rtoken_re when Qnil)
 * and an optional +proc+ applied to each matched token. */
static FrtTokenStream *
rets_new(VALUE rtext, VALUE regex, VALUE proc)
{
    FrtTokenStream *ts = frt_ts_new(RegExpTokenStream);

    if (rtext != Qnil) {
        rtext = StringValue(rtext);
        /* prevent the text from being garbage collected */
        rb_hash_aset(object_space, ((VALUE)ts)|1, rtext);
    }
    ts->reset = &rets_reset;
    ts->next = &rets_next;
    ts->clone_i = &rets_clone_i;
    ts->destroy_i = &rets_destroy_i;

    RETS(ts)->curr_ind = 0;
    RETS(ts)->rtext = rtext;
    RETS(ts)->proc = proc;

    if (NIL_P(regex)) {
        RETS(ts)->regex = rtoken_re;
    } else {
        Check_Type(regex, T_REGEXP);
        RETS(ts)->regex = regex;
    }

    return ts;
}
796
+
797
/*
 * call-seq:
 *     RegExpTokenizer.new(input, /[[:alpha:]]+/)
 *
 * Create a new tokenizer based on a regular expression
 *
 * input::  text to tokenizer
 * regexp:: regular expression used to recognize tokens in the input
 */
static VALUE
frb_rets_init(int argc, VALUE *argv, VALUE self)
{
    VALUE rtext, regex, proc;
    FrtTokenStream *ts;

    /* "11&": required input string, optional regexp, optional block */
    rb_scan_args(argc, argv, "11&", &rtext, &regex, &proc);

    ts = rets_new(rtext, regex, proc);

    Frt_Wrap_Struct(self, &frb_rets_mark, &frb_rets_free, ts);
    object_add(ts, self);
    return self;
}
820
+
821
+ /****************************************************************************
822
+ * Tokenizers
823
+ ****************************************************************************/
824
+
825
/* Parse tokenizer constructor args: a required string and an optional
 * +lower+ flag. Declares +rstr+, +rlower+ and +lower+ in the calling
 * scope. The original tested `argc`, but the string argument is
 * mandatory so argc >= 1 always held and the `dflt` default was never
 * applied (benign today only because both call sites pass false, which
 * equals RTEST(Qnil)); test for the presence of the second argument. */
#define TS_ARGS(dflt) \
    bool lower;\
    VALUE rlower, rstr;\
    rb_scan_args(argc, argv, "11", &rstr, &rlower);\
    lower = (argc > 1 ? RTEST(rlower) : dflt)
830
+
831
/*
 * call-seq:
 *    AsciiLetterTokenizer.new() -> tokenizer
 *
 * Create a new AsciiLetterTokenizer
 */
static VALUE
frb_a_letter_tokenizer_init(VALUE self, VALUE rstr)
{
    /* wrap the ASCII-only letter tokenizer around rstr */
    return get_wrapped_ts(self, rstr, frt_letter_tokenizer_new());
}
842
+
843
/*
 * call-seq:
 *    LetterTokenizer.new(lower = true) -> tokenizer
 *
 * Create a new LetterTokenizer which optionally downcases tokens. Downcasing
 * is done according the current locale.
 *
 * lower:: set to false if you don't wish to downcase tokens
 */
static VALUE
frb_letter_tokenizer_init(int argc, VALUE *argv, VALUE self)
{
    TS_ARGS(false);
#if !defined POSH_OS_WIN32 && !defined POSH_OS_WIN64
    /* lazily initialize the process locale for multi-byte handling */
    if (!frb_locale) frb_locale = setlocale(LC_CTYPE, "");
#endif
    return get_wrapped_ts(self, rstr, frt_mb_letter_tokenizer_new(lower));
}
861
+
862
/*
 * call-seq:
 *    AsciiWhiteSpaceTokenizer.new() -> tokenizer
 *
 * Create a new AsciiWhiteSpaceTokenizer
 */
static VALUE
frb_a_whitespace_tokenizer_init(VALUE self, VALUE rstr)
{
    /* wrap the ASCII-only whitespace tokenizer around rstr */
    return get_wrapped_ts(self, rstr, frt_whitespace_tokenizer_new());
}
873
+
874
/*
 * call-seq:
 *    WhiteSpaceTokenizer.new(lower = true) -> tokenizer
 *
 * Create a new WhiteSpaceTokenizer which optionally downcases tokens.
 * Downcasing is done according the current locale.
 *
 * lower:: set to false if you don't wish to downcase tokens
 */
static VALUE
frb_whitespace_tokenizer_init(int argc, VALUE *argv, VALUE self)
{
    TS_ARGS(false);
#if !defined POSH_OS_WIN32 && !defined POSH_OS_WIN64
    /* lazily initialize the process locale for multi-byte handling */
    if (!frb_locale) frb_locale = setlocale(LC_CTYPE, "");
#endif
    return get_wrapped_ts(self, rstr, frt_mb_whitespace_tokenizer_new(lower));
}
892
+
893
/*
 * call-seq:
 *    AsciiStandardTokenizer.new() -> tokenizer
 *
 * Create a new AsciiStandardTokenizer
 */
static VALUE
frb_a_standard_tokenizer_init(VALUE self, VALUE rstr)
{
    /* wrap the ASCII-only standard tokenizer around rstr */
    return get_wrapped_ts(self, rstr, frt_standard_tokenizer_new());
}
904
+
905
/*
 * call-seq:
 *    StandardTokenizer.new(lower = true) -> tokenizer
 *
 * Create a new StandardTokenizer which optionally downcases tokens.
 * Downcasing is done according the current locale.
 *
 * lower:: set to false if you don't wish to downcase tokens
 *
 * NOTE(review): the call-seq above documents a +lower+ parameter, but
 * this function's signature takes only the input string and
 * frt_mb_standard_tokenizer_new() is called without a lower flag —
 * confirm whether the doc or the binding is out of date.
 */
static VALUE
frb_standard_tokenizer_init(VALUE self, VALUE rstr)
{
#if !defined POSH_OS_WIN32 && !defined POSH_OS_WIN64
    /* lazily initialize the process locale for multi-byte handling */
    if (!frb_locale) frb_locale = setlocale(LC_CTYPE, "");
#endif
    return get_wrapped_ts(self, rstr, frt_mb_standard_tokenizer_new());
}
922
+
923
+ /****************************************************************************
924
+ * Filters
925
+ ****************************************************************************/
926
+
927
+
928
/*
 * call-seq:
 *    AsciiLowerCaseFilter.new(token_stream) -> token_stream
 *
 * Create an AsciiLowerCaseFilter which normalizes a token's text to
 * lowercase but only for ASCII characters. For other characters use
 * LowerCaseFilter.
 */
static VALUE
frb_a_lowercase_filter_init(VALUE self, VALUE rsub_ts)
{
    FrtTokenStream *ts = frb_get_cwrapped_rts(rsub_ts);
    ts = frt_lowercase_filter_new(ts);
    /* keep the Ruby sub-stream reachable from the filter */
    object_add(&(TkFilt(ts)->sub_ts), rsub_ts);

    Frt_Wrap_Struct(self, &frb_tf_mark, &frb_tf_free, ts);
    object_add(ts, self);
    return self;
}
947
+
948
/*
 * call-seq:
 *    LowerCaseFilter.new(token_stream) -> token_stream
 *
 * Create an LowerCaseFilter which normalizes a token's text to
 * lowercase based on the current locale.
 */
static VALUE
frb_lowercase_filter_init(VALUE self, VALUE rsub_ts)
{
    FrtTokenStream *ts = frb_get_cwrapped_rts(rsub_ts);
#if !defined POSH_OS_WIN32 && !defined POSH_OS_WIN64
    /* lazily initialize the process locale for multi-byte handling */
    if (!frb_locale) frb_locale = setlocale(LC_CTYPE, "");
#endif
    ts = frt_mb_lowercase_filter_new(ts);
    /* keep the Ruby sub-stream reachable from the filter */
    object_add(&(TkFilt(ts)->sub_ts), rsub_ts);

    Frt_Wrap_Struct(self, &frb_tf_mark, &frb_tf_free, ts);
    object_add(ts, self);
    return self;
}
969
+
970
/*
 * call-seq:
 *    HyphenFilter.new(token_stream) -> token_stream
 *
 * Create an HyphenFilter which filters hyphenated words. The way it works is
 * by adding both the word concatenated into a single word and split into
 * multiple words. ie "e-mail" becomes "email" and "e mail". This way a
 * search for "e-mail", "email" and "mail" will all match. This filter is
 * used by default by the StandardAnalyzer.
 */
static VALUE
frb_hyphen_filter_init(VALUE self, VALUE rsub_ts)
{
    FrtTokenStream *ts = frb_get_cwrapped_rts(rsub_ts);
    ts = frt_hyphen_filter_new(ts);
    /* keep the Ruby sub-stream reachable from the filter */
    object_add(&(TkFilt(ts)->sub_ts), rsub_ts);

    Frt_Wrap_Struct(self, &frb_tf_mark, &frb_tf_free, ts);
    object_add(ts, self);
    return self;
}
991
+
992
/*
 * call-seq:
 *    StopFilter.new(token_stream) -> token_stream
 *    StopFilter.new(token_stream, ["the", "and", "it"]) -> token_stream
 *
 * Create an StopFilter which removes *stop-words* from a TokenStream. You can
 * optionally specify the stopwords you wish to have removed.
 *
 * token_stream:: TokenStream to be filtered
 * stop_words::   Array of *stop-words* you wish to be filtered out. This
 *                defaults to a list of English stop-words. The
 *                Ferret::Analysis contains a number of stop-word lists.
 */
static VALUE
frb_stop_filter_init(int argc, VALUE *argv, VALUE self)
{
    VALUE rsub_ts, rstop_words;
    FrtTokenStream *ts;
    rb_scan_args(argc, argv, "11", &rsub_ts, &rstop_words);
    ts = frb_get_cwrapped_rts(rsub_ts);
    if (rstop_words != Qnil) {
        /* get_stopwords returns a malloc'd, NULL-terminated list;
         * only the list itself needs freeing here */
        char **stop_words = get_stopwords(rstop_words);
        ts = frt_stop_filter_new_with_words(ts, (const char **)stop_words);

        free(stop_words);
    } else {
        ts = frt_stop_filter_new(ts);
    }
    /* keep the Ruby sub-stream reachable from the filter */
    object_add(&(TkFilt(ts)->sub_ts), rsub_ts);

    Frt_Wrap_Struct(self, &frb_tf_mark, &frb_tf_free, ts);
    object_add(ts, self);
    return self;
}
1026
+
1027
/* Add a single +from+ (String or Symbol) -> +to+ mapping to the
 * MappingFilter +mf+. Raises ArgumentError for any other key type. */
static void frb_add_mapping_i(FrtTokenStream *mf, VALUE from,
                              const char *to)
{
    switch (TYPE(from)) {
        case T_STRING:
            frt_mapping_filter_add(mf, rs2s(from), to);
            break;
        case T_SYMBOL:
            frt_mapping_filter_add(mf, rb_id2name(SYM2ID(from)), to);
            break;
        default:
            rb_raise(rb_eArgError,
                     "cannot map from %s with MappingFilter",
                     rs2s(rb_obj_as_string(from)));
            break;
    }
}
1044
+
1045
+ static int frb_add_mappings_i(VALUE key, VALUE value, VALUE arg)
1046
+ {
1047
+ if (key == Qundef) {
1048
+ return ST_CONTINUE;
1049
+ } else {
1050
+ FrtTokenStream *mf = (FrtTokenStream *)arg;
1051
+ const char *to;
1052
+ switch (TYPE(value)) {
1053
+ case T_STRING:
1054
+ to = rs2s(value);
1055
+ break;
1056
+ case T_SYMBOL:
1057
+ to = rb_id2name(SYM2ID(value));
1058
+ break;
1059
+ default:
1060
+ rb_raise(rb_eArgError,
1061
+ "cannot map to %s with MappingFilter",
1062
+ rs2s(rb_obj_as_string(key)));
1063
+ break;
1064
+ }
1065
+ if (TYPE(key) == T_ARRAY) {
1066
+ int i;
1067
+ for (i = RARRAY_LEN(key) - 1; i >= 0; i--) {
1068
+ frb_add_mapping_i(mf, RARRAY_PTR(key)[i], to);
1069
+ }
1070
+ }
1071
+ else {
1072
+ frb_add_mapping_i(mf, key, to);
1073
+ }
1074
+ }
1075
+ return ST_CONTINUE;
1076
+ }
1077
+
1078
+
1079
/*
 * call-seq:
 *    MappingFilter.new(token_stream, mapping) -> token_stream
 *
 * Create an MappingFilter which maps strings in tokens. This is usually used
 * to map UTF-8 characters to ASCII characters for easier searching and
 * better search recall. The mapping is compiled into a Deterministic Finite
 * Automata so it is super fast. This Filter can therefor be used for
 * indexing very large datasets. Currently regular expressions are not
 * supported. If you are really interested in the feature, please contact me
 * at dbalmain@gmail.com.
 *
 * token_stream:: TokenStream to be filtered
 * mapping::      Hash of mappings to apply to tokens. The key can be a
 *                String or an Array of Strings. The value must be a String
 *
 * == Example
 *
 *    filt = MappingFilter.new(token_stream,
 *                             {
 *                               ['à','á','â','ã','ä','å'] => 'a',
 *                               ['è','é','ê','ë','ē','ę'] => 'e'
 *                             })
 */
static VALUE
frb_mapping_filter_init(VALUE self, VALUE rsub_ts, VALUE mapping)
{
    FrtTokenStream *ts;
    ts = frb_get_cwrapped_rts(rsub_ts);
    ts = frt_mapping_filter_new(ts);
    /* load every mapping entry, then compile the DFA once */
    rb_hash_foreach(mapping, frb_add_mappings_i, (VALUE)ts);
    frt_mulmap_compile(((FrtMappingFilter *)ts)->mapper);
    /* keep the Ruby sub-stream reachable from the filter */
    object_add(&(TkFilt(ts)->sub_ts), rsub_ts);

    Frt_Wrap_Struct(self, &frb_tf_mark, &frb_tf_free, ts);
    object_add(ts, self);
    return self;
}
1117
+
1118
/*
 * call-seq:
 *    StemFilter.new(token_stream) -> token_stream
 *    StemFilter.new(token_stream,
 *                   algorithm="english",
 *                   encoding="UTF-8") -> token_stream
 *
 * Create an StemFilter which uses a snowball stemmer (thank you Martin
 * Porter) to stem words. You can optionally specify the algorithm (default:
 * "english") and encoding (default: "UTF-8").
 *
 * token_stream:: TokenStream to be filtered
 * algorithm::    The algorithm (or language) to use
 * encoding::     The encoding of the data (default: "UTF-8")
 */
static VALUE
frb_stem_filter_init(int argc, VALUE *argv, VALUE self)
{
    VALUE rsub_ts, ralgorithm, rcharenc;
    const char *algorithm = "english";
    char *charenc = NULL;   /* NULL lets the stemmer pick its default */
    FrtTokenStream *ts;
    rb_scan_args(argc, argv, "12", &rsub_ts, &ralgorithm, &rcharenc);
    ts = frb_get_cwrapped_rts(rsub_ts);
    switch (argc) {
        case 3: charenc = rs2s(rb_obj_as_string(rcharenc));
                /* fall through: 3 args implies the algorithm was given */
        case 2: algorithm = rs2s(rb_obj_as_string(ralgorithm));
    }
    ts = frt_stem_filter_new(ts, algorithm, charenc);
    /* keep the Ruby sub-stream reachable from the filter */
    object_add(&(TkFilt(ts)->sub_ts), rsub_ts);

    Frt_Wrap_Struct(self, &frb_tf_mark, &frb_tf_free, ts);
    object_add(ts, self);
    /* a bad algorithm/encoding pair leaves the stemmer NULL */
    if (((FrtStemFilter *)ts)->stemmer == NULL) {
        rb_raise(rb_eArgError, "No stemmer could be found with the encoding "
                 "%s and the language %s", charenc, algorithm);
    }
    return self;
}
1157
+
1158
+ /****************************************************************************
1159
+ *
1160
+ * FrtAnalyzer Methods
1161
+ *
1162
+ ****************************************************************************/
1163
+
1164
+ /****************************************************************************
1165
+ * CWrappedAnalyzer Methods
1166
+ ****************************************************************************/
1167
+
1168
#define GET_A(a, self) Data_Get_Struct(self, FrtAnalyzer, a)

#define CWA(analyzer) ((CWrappedAnalyzer *)(analyzer))
/* An FrtAnalyzer whose token streams are produced by calling
 * #token_stream on a wrapped Ruby analyzer object. */
typedef struct CWrappedAnalyzer
{
    FrtAnalyzer super;
    VALUE ranalyzer;    /* the wrapped Ruby analyzer */
} CWrappedAnalyzer;
1176
+
1177
/* destroy_i callback: unpin the Ruby analyzer from object_space and
 * free the wrapper. */
static void
cwa_destroy_i(FrtAnalyzer *a)
{
    rb_hash_delete(object_space, ((VALUE)a)|1);
    free(a);
}
1183
+
1184
/* get_ts callback: call ranalyzer.token_stream(field, text) in Ruby
 * and wrap the resulting token stream for use from C. */
static FrtTokenStream *
cwa_get_ts(FrtAnalyzer *a, FrtSymbol field, char *text)
{
    VALUE rts = rb_funcall(CWA(a)->ranalyzer, id_token_stream, 2,
                           rb_str_new_cstr(rb_id2name(field)), rb_str_new_cstr(text));
    return frb_get_cwrapped_rts(rts);
}
1191
+
1192
/* Return a C FrtAnalyzer for the Ruby analyzer +ranalyzer+.
 * If it already wraps a C analyzer, reuse it (taking an extra
 * reference); otherwise build a CWrappedAnalyzer that dispatches back
 * into Ruby via cwa_get_ts. */
FrtAnalyzer *
frb_get_cwrapped_analyzer(VALUE ranalyzer)
{
    FrtAnalyzer *a = NULL;
    if (frb_is_cclass(ranalyzer) && DATA_PTR(ranalyzer)) {
        Data_Get_Struct(ranalyzer, FrtAnalyzer, a);
        FRT_REF(a);
    }
    else {
        a = (FrtAnalyzer *)frt_ecalloc(sizeof(CWrappedAnalyzer));
        a->destroy_i = &cwa_destroy_i;
        a->get_ts = &cwa_get_ts;
        a->ref_cnt = 1;
        ((CWrappedAnalyzer *)a)->ranalyzer = ranalyzer;
        /* prevent from being garbage collected */
        rb_hash_aset(object_space, ((VALUE)a)|1, ranalyzer);
    }
    return a;
}
1211
+
1212
/* Free function for Ruby-wrapped analyzers: drop the object-map entry
 * and release one reference. */
static void
frb_analyzer_free(FrtAnalyzer *a)
{
    object_del(a);
    frt_a_deref(a);
}
1218
+
1219
/* Return the Ruby object wrapping analyzer +a+, creating and caching
 * one (with an extra reference on +a+) if none exists yet. Returns
 * Qnil when +a+ is NULL. */
VALUE
frb_get_analyzer(FrtAnalyzer *a)
{
    VALUE self = Qnil;
    if (a) {
        self = object_get(a);
        if (self == Qnil) {
            self = Data_Wrap_Struct(cAnalyzer, NULL, &frb_analyzer_free, a);
            FRT_REF(a);
            object_add(a, self);
        }
    }
    return self;
}
1233
+
1234
/* Get a token stream from analyzer +a+ for +rfield+/+rstring+ and
 * return it wrapped as a Ruby object, remembering the source string
 * so it stays alive with the stream. */
VALUE
get_rb_ts_from_a(FrtAnalyzer *a, VALUE rfield, VALUE rstring)
{
    FrtTokenStream *ts = frt_a_get_ts(a, frb_field(rfield), rs2s(rstring));

    /* Make sure that there is no entry already */
    object_set(&ts->text, rstring);
    return get_rb_token_stream(ts);
}
1243
+
1244
/*
 * call-seq:
 *    analyzer.token_stream(field_name, input) -> token_stream
 *
 * Create a new TokenStream to tokenize +input+. The TokenStream created may
 * also depend on the +field_name+. Although this parameter is typically
 * ignored.
 *
 * field_name:: name of the field to be tokenized
 * input::      data from the field to be tokenized
 */
static VALUE
frb_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
{
    /* NOTE: Any changes made to this method may also need to be applied to
     * frb_re_analyzer_token_stream */
    FrtAnalyzer *a;
    GET_A(a, self);

    /* coerce the input to a String before handing it to the analyzer */
    StringValue(rstring);

    return get_rb_ts_from_a(a, rfield, rstring);
}
1267
+
1268
/* Parse an optional +lower+ flag argument ("01" spec, so argc may be
 * 0); declares +rlower+ and +lower+ in the calling scope, with +lower+
 * defaulting to +dflt+ when the argument is omitted. */
#define GET_LOWER(dflt) \
    bool lower;\
    VALUE rlower;\
    rb_scan_args(argc, argv, "01", &rlower);\
    lower = (argc ? RTEST(rlower) : dflt)
1273
+
1274
/*
 * call-seq:
 *    AsciiWhiteSpaceAnalyzer.new(lower = false) -> analyzer
 *
 * Create a new AsciiWhiteSpaceAnalyzer which downcases tokens by default
 * but can optionally leave case as is. Lowercasing will only be done to
 * ASCII characters.
 *
 * lower:: set to false if you don't want the field's tokens to be downcased
 */
static VALUE
frb_a_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
{
    FrtAnalyzer *a;
    GET_LOWER(false);
    a = frt_whitespace_analyzer_new(lower);
    Frt_Wrap_Struct(self, NULL, &frb_analyzer_free, a);
    object_add(a, self);
    return self;
}
1294
+
1295
/*
 * call-seq:
 *    WhiteSpaceAnalyzer.new(lower = false) -> analyzer
 *
 * Create a new WhiteSpaceAnalyzer which downcases tokens by default but can
 * optionally leave case as is. Lowercasing will be done based on the current
 * locale.
 *
 * lower:: set to false if you don't want the field's tokens to be downcased
 */
static VALUE
frb_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
{
    FrtAnalyzer *a;
    GET_LOWER(false);
#if !defined POSH_OS_WIN32 && !defined POSH_OS_WIN64
    /* lazily initialize the process locale for multi-byte handling */
    if (!frb_locale) frb_locale = setlocale(LC_CTYPE, "");
#endif
    a = frt_mb_whitespace_analyzer_new(lower);
    Frt_Wrap_Struct(self, NULL, &frb_analyzer_free, a);
    object_add(a, self);
    return self;
}
1318
+
1319
/*
 * call-seq:
 *    AsciiLetterAnalyzer.new(lower = true) -> analyzer
 *
 * Create a new AsciiLetterAnalyzer which downcases tokens by default
 * but can optionally leave case as is. Lowercasing will only be done to
 * ASCII characters.
 *
 * lower:: set to false if you don't want the field's tokens to be downcased
 */
static VALUE
frb_a_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
{
    FrtAnalyzer *a;
    GET_LOWER(true);
    a = frt_letter_analyzer_new(lower);
    Frt_Wrap_Struct(self, NULL, &frb_analyzer_free, a);
    object_add(a, self);
    return self;
}
1339
+
1340
/*
 * call-seq:
 *    LetterAnalyzer.new(lower = true) -> analyzer
 *
 * Create a new LetterAnalyzer which downcases tokens by default but can
 * optionally leave case as is. Lowercasing will be done based on the current
 * locale.
 *
 * lower:: set to false if you don't want the field's tokens to be downcased
 */
static VALUE
frb_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
{
    FrtAnalyzer *a;
    GET_LOWER(true);
#if !defined POSH_OS_WIN32 && !defined POSH_OS_WIN64
    /* lazily initialize the process locale for multi-byte handling */
    if (!frb_locale) frb_locale = setlocale(LC_CTYPE, "");
#endif
    a = frt_mb_letter_analyzer_new(lower);
    Frt_Wrap_Struct(self, NULL, &frb_analyzer_free, a);
    object_add(a, self);
    return self;
}
1363
+
1364
+ static VALUE
1365
+ get_rstopwords(const char **stop_words)
1366
+ {
1367
+ char **w = (char **)stop_words;
1368
+ VALUE rstopwords = rb_ary_new();
1369
+
1370
+ while (*w) {
1371
+ rb_ary_push(rstopwords, rb_str_new2(*w));
1372
+ w++;
1373
+ }
1374
+ return rstopwords;
1375
+ }
1376
+
1377
/*
 * call-seq:
 *    AsciiStandardAnalyzer.new(stop_words = FRT_FULL_ENGLISH_STOP_WORDS, lower = true)
 *    -> analyzer
 *
 * Create a new AsciiStandardAnalyzer which downcases tokens by default but
 * can optionally leave case as is. Lowercasing will only be done to ASCII
 * characters. You can also set the list of stop-words to be used by the
 * StopFilter.
 *
 * lower::      set to false if you don't want the field's tokens to be downcased
 * stop_words:: list of stop-words to pass to the StopFilter
 *
 * (The call-seq above follows the actual rb_scan_args order below:
 * stop_words first, then lower.)
 */
static VALUE
frb_a_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
{
    bool lower;
    VALUE rlower, rstop_words;
    FrtAnalyzer *a;
    rb_scan_args(argc, argv, "02", &rstop_words, &rlower);
    lower = ((rlower == Qnil) ? true : RTEST(rlower));
    if (rstop_words != Qnil) {
        char **stop_words = get_stopwords(rstop_words);
        a = frt_standard_analyzer_new_with_words((const char **)stop_words, lower);
        free(stop_words);
    } else {
        a = frt_standard_analyzer_new(lower);
    }
    Frt_Wrap_Struct(self, NULL, &frb_analyzer_free, a);
    object_add(a, self);
    return self;
}
1409
+
1410
/*
 * call-seq:
 *    StandardAnalyzer.new(stop_words = FRT_FULL_ENGLISH_STOP_WORDS, lower=true)
 *    -> analyzer
 *
 * Create a new StandardAnalyzer which downcases tokens by default but can
 * optionally leave case as is. Lowercasing will be done based on the current
 * locale. You can also set the list of stop-words to be used by the
 * StopFilter.
 *
 * lower::      set to false if you don't want the field's tokens to be downcased
 * stop_words:: list of stop-words to pass to the StopFilter
 */
static VALUE
frb_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
{
    bool lower;
    VALUE rlower, rstop_words;
    FrtAnalyzer *a;
#if !defined POSH_OS_WIN32 && !defined POSH_OS_WIN64
    /* lazily initialize the process locale for multi-byte handling */
    if (!frb_locale) frb_locale = setlocale(LC_CTYPE, "");
#endif
    rb_scan_args(argc, argv, "02", &rstop_words, &rlower);
    lower = ((rlower == Qnil) ? true : RTEST(rlower));
    if (rstop_words != Qnil) {
        char **stop_words = get_stopwords(rstop_words);
        a = frt_mb_standard_analyzer_new_with_words((const char **)stop_words, lower);
        free(stop_words);
    } else {
        a = frt_mb_standard_analyzer_new(lower);
    }
    Frt_Wrap_Struct(self, NULL, &frb_analyzer_free, a);
    object_add(a, self);
    return self;
}
1445
+
1446
/* frt_h_each callback: GC-mark each hash value (key and arg unused). */
static void
frb_h_mark_values_i(void *key, void *value, void *arg)
{
    frb_gc_mark(value);
}
1451
+
1452
/* GC mark function for PerFieldAnalyzer: mark the default analyzer and
 * every per-field analyzer stored in the dict. */
static void
frb_pfa_mark(void *p)
{
    frb_gc_mark(PFA(p)->default_a);
    frt_h_each(PFA(p)->dict, &frb_h_mark_values_i, NULL);
}
1458
+
1459
/*** PerFieldAnalyzer ***/

/*
 * call-seq:
 *    PerFieldAnalyzer.new(default_analyzer) -> analyzer
 *
 * Create a new PerFieldAnalyzer specifying the default analyzer to use on
 * all fields that aren't set specifically.
 *
 * default_analyzer:: analyzer to be used on fields that aren't otherwise
 *                    specified
 */
static VALUE
frb_per_field_analyzer_init(VALUE self, VALUE ranalyzer)
{
    FrtAnalyzer *def = frb_get_cwrapped_analyzer(ranalyzer);
    FrtAnalyzer *a = frt_per_field_analyzer_new(def);
    Frt_Wrap_Struct(self, &frb_pfa_mark, &frb_analyzer_free, a);
    object_add(a, self);
    return self;
}
1480
+
1481
/*
 * call-seq:
 *    per_field_analyzer.add_field(field_name, analyzer) -> self
 *    per_field_analyzer[field_name] = analyzer -> self
 *
 * Set the analyzer to be used on field +field_name+. Note that field_name
 * should be a symbol.
 *
 * field_name:: field we wish to set the analyzer for
 * analyzer::   analyzer to be used on +field_name+
 */
static VALUE
frb_per_field_analyzer_add_field(VALUE self, VALUE rfield, VALUE ranalyzer)
{
    FrtAnalyzer *pfa, *a;
    Data_Get_Struct(self, FrtAnalyzer, pfa);
    a = frb_get_cwrapped_analyzer(ranalyzer);

    frt_pfa_add_field(pfa, frb_field(rfield), a);
    return self;
}
1502
+
1503
/*
 * call-seq:
 *    analyzer.token_stream(field_name, input) -> token_stream
 *
 * Create a new TokenStream to tokenize +input+. The TokenStream created will
 * also depend on the +field_name+ in the case of the PerFieldAnalyzer.
 *
 * field_name:: name of the field to be tokenized
 * input::      data from the field to be tokenized
 */
static VALUE
frb_pfa_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
{
    FrtAnalyzer *pfa, *a;
    FrtSymbol field = frb_field(rfield);
    GET_A(pfa, self);

    StringValue(rstring);
    /* look up the field-specific analyzer, falling back to the default */
    a = (FrtAnalyzer *)frt_h_get(PFA(pfa)->dict, (void *)field);
    if (a == NULL) {
        a = PFA(pfa)->default_a;
    }
    /* Ruby-implemented analyzers are called directly so the Ruby
     * token stream is returned as-is */
    if (a->get_ts == cwa_get_ts) {
        return rb_funcall(CWA(a)->ranalyzer, id_token_stream, 2,
                          rb_str_new_cstr(rb_id2name(field)), rb_str_new_cstr(rs2s(rstring)));
    }
    else {
        return get_rb_ts_from_a(a, rfield, rstring);
    }
}
1533
+
1534
/*** RegExpAnalyzer ***/

/* GC mark function: keep the analyzer's current token stream alive. */
static void
frb_re_analyzer_mark(FrtAnalyzer *a)
{
    frb_gc_mark(a->current_ts);
}
1541
+
1542
/* destroy_i callback for RegExpAnalyzer: release the shared token
 * stream and free the analyzer. */
static void
re_analyzer_destroy_i(FrtAnalyzer *a)
{
    frt_ts_deref(a->current_ts);
    free(a);
}
1548
+
1549
/*
 * call-seq:
 *    RegExpAnalyzer.new(reg_exp, lower = true) -> analyzer
 *
 * Create a new RegExpAnalyzer which will create tokenizers based on the
 * regular expression and lowercasing if required.
 *
 * reg_exp:: the token matcher for the tokenizer to use
 * lower::   set to false if you don't want to downcase the tokens
 */
static VALUE
frb_re_analyzer_init(int argc, VALUE *argv, VALUE self)
{
    VALUE lower, rets, regex, proc;
    FrtAnalyzer *a;
    FrtTokenStream *ts;
    rb_scan_args(argc, argv, "02&", &regex, &lower, &proc);

    /* build the underlying regexp tokenizer with no text yet */
    ts = rets_new(Qnil, regex, proc);
    rets = Data_Wrap_Struct(cRegExpTokenizer, &frb_rets_mark, &frb_rets_free, ts);
    object_add(ts, rets);

    /* lowercasing is on unless lower was explicitly passed as false
     * (the Qnil default enables it) */
    if (lower != Qfalse) {
        rets = frb_lowercase_filter_init(frb_data_alloc(cLowerCaseFilter), rets);
        ts = DATA_PTR(rets);
    }
    FRT_REF(ts);

    a = frt_analyzer_new(ts, &re_analyzer_destroy_i, NULL);
    Frt_Wrap_Struct(self, &frb_re_analyzer_mark, &frb_analyzer_free, a);
    object_add(a, self);
    return self;
}
1582
+
1583
/*
 * call-seq:
 *    analyzer.token_stream(field_name, input) -> token_stream
 *
 * Create a new TokenStream to tokenize +input+. The TokenStream created may
 * also depend on the +field_name+. Although this parameter is typically
 * ignored.
 *
 * field_name:: name of the field to be tokenized
 * input::      data from the field to be tokenized
 */
static VALUE
frb_re_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rtext)
{
    FrtTokenStream *ts;
    FrtAnalyzer *a;
    GET_A(a, self);

    StringValue(rtext);

    ts = frt_a_get_ts(a, frb_field(rfield), rs2s(rtext));

    /* Make sure that there is no entry already */
    object_set(&ts->text, rtext);
    /* point the regexp tokenizer at the Ruby string and pin it; when
     * the lowercase filter wraps the tokenizer, reach through to the
     * underlying sub-stream instead */
    if (ts->next == &rets_next) {
        RETS(ts)->rtext = rtext;
        rb_hash_aset(object_space, ((VALUE)ts)|1, rtext);
    }
    else {
        RETS(((FrtTokenFilter*)ts)->sub_ts)->rtext = rtext;
        rb_hash_aset(object_space, ((VALUE)((FrtTokenFilter*)ts)->sub_ts)|1, rtext);
    }
    return get_rb_token_stream(ts);
}
1617
+
1618
+ /****************************************************************************
1619
+ *
1620
+ * Locale stuff
1621
+ *
1622
+ ****************************************************************************/
1623
+
1624
/*
 * call-seq:
 *    Ferret.locale ->  locale_str
 *
 * Returns a string corresponding to the locale set. For example;
 *
 *    puts Ferret.locale #=> "en_US.UTF-8"
 */
static VALUE frb_get_locale(VALUE self)
{
    /* frb_locale is NULL until a locale has been set or lazily
     * initialized elsewhere */
    return (frb_locale ? rb_str_new2(frb_locale) : Qnil);
}
1636
+
1637
/*
 * call-seq:
 *    Ferret.locale = "en_US.UTF-8"
 *
 * Set the global locale. You should use this method to set different locales
 * when indexing documents with different encodings.
 */
static VALUE frb_set_locale(VALUE self, VALUE locale)
{
    /* nil resets to the C default locale query form */
    char *l = ((locale == Qnil) ? NULL : rs2s(rb_obj_as_string(locale)));
    frb_locale = setlocale(LC_CTYPE, l);
    /* setlocale may fail and return NULL, in which case return nil */
    return frb_locale ? rb_str_new2(frb_locale) : Qnil;
}
1650
+
1651
+ /****************************************************************************
1652
+ *
1653
+ * Init Functions
1654
+ *
1655
+ ****************************************************************************/
1656
+
1657
+ /*
1658
+ * Document-class: Ferret::Analysis::Token
1659
+ *
1660
+ * == Summary
1661
+ *
1662
+ * A Token is an occurrence of a term from the text of a field. It consists
1663
+ * of a term's text and the start and end offset of the term in the text of
1664
+ * the field;
1665
+ *
1666
+ * The start and end offsets permit applications to re-associate a token with
1667
+ * its source text, e.g., to display highlighted query terms in a document
1668
+ * browser, or to show matching text fragments in a KWIC (KeyWord In Context)
1669
+ * display, etc.
1670
+ *
1671
+ * === Attributes
1672
+ *
1673
+ * text:: the terms text which may have been modified by a Token Filter or
1674
+ * Tokenizer from the text originally found in the document
1675
+ * start:: is the position of the first character corresponding to
1676
+ * this token in the source text
1677
+ * end:: is equal to one greater than the position of the last
1678
+ * character corresponding of this token Note that the
1679
+ * difference between @end_offset and @start_offset may not be
1680
+ * equal to @text.length(), as the term text may have been
1681
+ * altered by a stemmer or some other filter.
1682
+ */
1683
+ static void Init_Token(void)
1684
+ {
1685
+ cToken = rb_define_class_under(mAnalysis, "Token", rb_cObject);
1686
+ rb_define_alloc_func(cToken, frb_token_alloc);
1687
+ rb_include_module(cToken, rb_mComparable);
1688
+
1689
+ rb_define_method(cToken, "initialize", frb_token_init, -1);
1690
+ rb_define_method(cToken, "<=>", frb_token_cmp, 1);
1691
+ rb_define_method(cToken, "text", frb_token_get_text, 0);
1692
+ rb_define_method(cToken, "text=", frb_token_set_text, 1);
1693
+ rb_define_method(cToken, "start", frb_token_get_start_offset, 0);
1694
+ rb_define_method(cToken, "start=", frb_token_set_start_offset, 1);
1695
+ rb_define_method(cToken, "end", frb_token_get_end_offset, 0);
1696
+ rb_define_method(cToken, "end=", frb_token_set_end_offset, 1);
1697
+ rb_define_method(cToken, "pos_inc", frb_token_get_pos_inc, 0);
1698
+ rb_define_method(cToken, "pos_inc=", frb_token_set_pos_inc, 1);
1699
+ rb_define_method(cToken, "to_s", frb_token_to_s, 0);
1700
+ }
1701
+
1702
+ /*
1703
+ * Document-class: Ferret::Analysis::TokenStream
1704
+ *
1705
+ * == Summary
1706
+ *
1707
+ * A TokenStream enumerates the sequence of tokens, either from
1708
+ * fields of a document or from query text.
1709
+ *
1710
+ * This is an abstract class. Concrete subclasses are:
1711
+ *
1712
+ * Tokenizer:: a TokenStream whose input is a string
1713
+ * TokenFilter:: a TokenStream whose input is another TokenStream
1714
+ */
1715
+ static void Init_TokenStream(void)
1716
+ {
1717
+ cTokenStream = rb_define_class_under(mAnalysis, "TokenStream", rb_cObject);
1718
+ frb_mark_cclass(cTokenStream);
1719
+ rb_define_method(cTokenStream, "next", frb_ts_next, 0);
1720
+ rb_define_method(cTokenStream, "text=", frb_ts_set_text, 1);
1721
+ rb_define_method(cTokenStream, "text", frb_ts_get_text, 0);
1722
+ }
1723
+
1724
+ /*
1725
+ * Document-class: Ferret::Analysis::AsciiLetterTokenizer
1726
+ *
1727
+ * == Summary
1728
+ *
1729
+ * A LetterTokenizer is a tokenizer that divides text at non-ASCII letters.
1730
+ * That is to say, it defines tokens as maximal strings of adjacent letters,
1731
+ * as defined by the regular expression _/[A-Za-z]+/_.
1732
+ *
1733
+ * === Example
1734
+ *
1735
+ * "Dave's résumé, at http://www.davebalmain.com/ 1234"
1736
+ * => ["Dave", "s", "r", "sum", "at", "http", "www", "davebalmain", "com"]
1737
+ */
1738
+ static void Init_AsciiLetterTokenizer(void)
1739
+ {
1740
+ cAsciiLetterTokenizer =
1741
+ rb_define_class_under(mAnalysis, "AsciiLetterTokenizer", cTokenStream);
1742
+ frb_mark_cclass(cAsciiLetterTokenizer);
1743
+ rb_define_alloc_func(cAsciiLetterTokenizer, frb_data_alloc);
1744
+ rb_define_method(cAsciiLetterTokenizer, "initialize",
1745
+ frb_a_letter_tokenizer_init, 1);
1746
+ }
1747
+
1748
+ /*
1749
+ * Document-class: Ferret::Analysis::LetterTokenizer
1750
+ *
1751
+ * == Summary
1752
+ *
1753
+ * A LetterTokenizer is a tokenizer that divides text at non-letters. That is
1754
+ * to say, it defines tokens as maximal strings of adjacent letters, as
1755
+ * defined by the regular expression _/[[:alpha:]]+/_ where [:alpha] matches
1756
+ * all characters in your local locale.
1757
+ *
1758
+ * === Example
1759
+ *
1760
+ * "Dave's résumé, at http://www.davebalmain.com/ 1234"
1761
+ * => ["Dave", "s", "résumé", "at", "http", "www", "davebalmain", "com"]
1762
+ */
1763
+ static void Init_LetterTokenizer(void)
1764
+ {
1765
+ cLetterTokenizer =
1766
+ rb_define_class_under(mAnalysis, "LetterTokenizer", cTokenStream);
1767
+ frb_mark_cclass(cLetterTokenizer);
1768
+ rb_define_alloc_func(cLetterTokenizer, frb_data_alloc);
1769
+ rb_define_method(cLetterTokenizer, "initialize",
1770
+ frb_letter_tokenizer_init, -1);
1771
+ }
1772
+
1773
+ /*
1774
+ * Document-class: Ferret::Analysis::AsciiWhiteSpaceTokenizer
1775
+ *
1776
+ * == Summary
1777
+ *
1778
+ * A WhiteSpaceTokenizer is a tokenizer that divides text at white-space.
1779
+ * Adjacent sequences of non-WhiteSpace characters form tokens.
1780
+ *
1781
+ * === Example
1782
+ *
1783
+ * "Dave's résumé, at http://www.davebalmain.com/ 1234"
1784
+ * => ["Dave's", "résumé,", "at", "http://www.davebalmain.com", "1234"]
1785
+ */
1786
+ static void Init_AsciiWhiteSpaceTokenizer(void)
1787
+ {
1788
+ cAsciiWhiteSpaceTokenizer =
1789
+ rb_define_class_under(mAnalysis, "AsciiWhiteSpaceTokenizer",
1790
+ cTokenStream);
1791
+ frb_mark_cclass(cAsciiWhiteSpaceTokenizer);
1792
+ rb_define_alloc_func(cAsciiWhiteSpaceTokenizer, frb_data_alloc);
1793
+ rb_define_method(cAsciiWhiteSpaceTokenizer, "initialize",
1794
+ frb_a_whitespace_tokenizer_init, 1);
1795
+ }
1796
+
1797
+ /*
1798
+ * Document-class: Ferret::Analysis::WhiteSpaceTokenizer
1799
+ *
1800
+ * == Summary
1801
+ *
1802
+ * A WhiteSpaceTokenizer is a tokenizer that divides text at white-space.
1803
+ * Adjacent sequences of non-WhiteSpace characters form tokens.
1804
+ *
1805
+ * === Example
1806
+ *
1807
+ * "Dave's résumé, at http://www.davebalmain.com/ 1234"
1808
+ * => ["Dave's", "résumé,", "at", "http://www.davebalmain.com", "1234"]
1809
+ */
1810
+ static void Init_WhiteSpaceTokenizer(void)
1811
+ {
1812
+ cWhiteSpaceTokenizer =
1813
+ rb_define_class_under(mAnalysis, "WhiteSpaceTokenizer", cTokenStream);
1814
+ frb_mark_cclass(cWhiteSpaceTokenizer);
1815
+ rb_define_alloc_func(cWhiteSpaceTokenizer, frb_data_alloc);
1816
+ rb_define_method(cWhiteSpaceTokenizer, "initialize",
1817
+ frb_whitespace_tokenizer_init, -1);
1818
+ }
1819
+
1820
+ /*
1821
+ * Document-class: Ferret::Analysis::AsciiStandardTokenizer
1822
+ *
1823
+ * == Summary
1824
+ *
1825
+ * The standard tokenizer is an advanced tokenizer which tokenizes most
1826
+ * words correctly as well as tokenizing things like email addresses, web
1827
+ * addresses, phone numbers, etc.
1828
+ *
1829
+ * === Example
1830
+ *
1831
+ * "Dave's résumé, at http://www.davebalmain.com/ 1234"
1832
+ * => ["Dave's", "r", "sum", "at", "http://www.davebalmain.com", "1234"]
1833
+ */
1834
+ static void Init_AsciiStandardTokenizer(void)
1835
+ {
1836
+ cAsciiStandardTokenizer =
1837
+ rb_define_class_under(mAnalysis, "AsciiStandardTokenizer", cTokenStream);
1838
+ frb_mark_cclass(cAsciiStandardTokenizer);
1839
+ rb_define_alloc_func(cAsciiStandardTokenizer, frb_data_alloc);
1840
+ rb_define_method(cAsciiStandardTokenizer, "initialize",
1841
+ frb_a_standard_tokenizer_init, 1);
1842
+ }
1843
+
1844
+ /*
1845
+ * Document-class: Ferret::Analysis::StandardTokenizer
1846
+ *
1847
+ * == Summary
1848
+ *
1849
+ * The standard tokenizer is an advanced tokenizer which tokenizes most
1850
+ * words correctly as well as tokenizing things like email addresses, web
1851
+ * addresses, phone numbers, etc.
1852
+ *
1853
+ * === Example
1854
+ *
1855
+ * "Dave's résumé, at http://www.davebalmain.com/ 1234"
1856
+ * => ["Dave's", "résumé", "at", "http://www.davebalmain.com", "1234"]
1857
+ */
1858
+ static void Init_StandardTokenizer(void)
1859
+ {
1860
+ cStandardTokenizer =
1861
+ rb_define_class_under(mAnalysis, "StandardTokenizer", cTokenStream);
1862
+ frb_mark_cclass(cStandardTokenizer);
1863
+ rb_define_alloc_func(cStandardTokenizer, frb_data_alloc);
1864
+ rb_define_method(cStandardTokenizer, "initialize",
1865
+ frb_standard_tokenizer_init, 1);
1866
+ }
1867
+
1868
+ /*
1869
+ * Document-class: Ferret::Analysis::RegExpTokenizer
1870
+ *
1871
+ * == Summary
1872
+ *
1873
+ * A tokenizer that recognizes tokens based on a regular expression passed to
1874
+ * the constructor. Most possible tokenizers can be created using this class.
1875
+ *
1876
+ * === Example
1877
+ *
1878
+ * Below is an example of a simple implementation of a LetterTokenizer using
1879
+ * an RegExpTokenizer. Basically, a token is a sequence of alphabetic
1880
+ * characters separated by one or more non-alphabetic characters.
1881
+ *
1882
+ * # of course you would add more than just é
1883
+ * RegExpTokenizer.new(input, /[[:alpha:]é]+/)
1884
+ *
1885
+ * "Dave's résumé, at http://www.davebalmain.com/ 1234"
1886
+ * => ["Dave", "s", "résumé", "at", "http", "www", "davebalmain", "com"]
1887
+ */
1888
+ static void Init_RegExpTokenizer(void)
1889
+ {
1890
+ cRegExpTokenizer =
1891
+ rb_define_class_under(mAnalysis, "RegExpTokenizer", cTokenStream);
1892
+ frb_mark_cclass(cRegExpTokenizer);
1893
+ rtoken_re = rb_reg_new(TOKEN_RE, strlen(TOKEN_RE), 0);
1894
+ rb_define_const(cRegExpTokenizer, "REGEXP", rtoken_re);
1895
+ rb_define_alloc_func(cRegExpTokenizer, frb_data_alloc);
1896
+ rb_define_method(cRegExpTokenizer, "initialize",
1897
+ frb_rets_init, -1);
1898
+ rb_define_method(cRegExpTokenizer, "text=", frb_rets_set_text, 1);
1899
+ rb_define_method(cRegExpTokenizer, "text", frb_rets_get_text, 0);
1900
+ }
1901
+
1902
+ /***************/
1903
+ /*** Filters ***/
1904
+ /***************/
1905
+
1906
+ /*
1907
+ * Document-class: Ferret::Analysis::AsciiLowerCaseFilter
1908
+ *
1909
+ * == Summary
1910
+ *
1911
+ * AsciiLowerCaseFilter normalizes a token's text to lowercase but only for
1912
+ * ASCII characters. For other characters use LowerCaseFilter.
1913
+ *
1914
+ * === Example
1915
+ *
1916
+ * ["One", "TWO", "three", "RÉSUMÉ"] => ["one", "two", "three", "rÉsumÉ"]
1917
+ *
1918
+ */
1919
+ static void Init_AsciiLowerCaseFilter(void)
1920
+ {
1921
+ cAsciiLowerCaseFilter =
1922
+ rb_define_class_under(mAnalysis, "AsciiLowerCaseFilter", cTokenStream);
1923
+ frb_mark_cclass(cAsciiLowerCaseFilter);
1924
+ rb_define_alloc_func(cAsciiLowerCaseFilter, frb_data_alloc);
1925
+ rb_define_method(cAsciiLowerCaseFilter, "initialize",
1926
+ frb_a_lowercase_filter_init, 1);
1927
+ }
1928
+
1929
+ /*
1930
+ * Document-class: Ferret::Analysis::LowerCaseFilter
1931
+ *
1932
+ * == Summary
1933
+ *
1934
+ * LowerCaseFilter normalizes a token's text to lowercase based on the
1935
+ * current locale.
1936
+ *
1937
+ * === Example
1938
+ *
1939
+ * ["One", "TWO", "three", "RÉSUMÉ"] => ["one", "two", "three", "résumé"]
1940
+ *
1941
+ */
1942
+ static void Init_LowerCaseFilter(void)
1943
+ {
1944
+ cLowerCaseFilter =
1945
+ rb_define_class_under(mAnalysis, "LowerCaseFilter", cTokenStream);
1946
+ frb_mark_cclass(cLowerCaseFilter);
1947
+ rb_define_alloc_func(cLowerCaseFilter, frb_data_alloc);
1948
+ rb_define_method(cLowerCaseFilter, "initialize",
1949
+ frb_lowercase_filter_init, 1);
1950
+ }
1951
+
1952
+ /*
1953
+ * Document-class: Ferret::Analysis::HyphenFilter
1954
+ *
1955
+ * == Summary
1956
+ *
1957
+ * HyphenFilter filters hyphenated words by adding both the word concatenated
1958
+ * into a single word and split into multiple words. ie "e-mail" becomes
1959
+ * "email" and "e mail". This way a search for "e-mail", "email" and "mail"
1960
+ * will all match. This filter is used by default by the StandardAnalyzer.
1961
+ *
1962
+ * === Example
1963
+ *
1964
+ * ["e-mail", "set-up"] => ["email", "e", "mail", "setup", "set", "up"]
1965
+ *
1966
+ */
1967
+ static void Init_HyphenFilter(void)
1968
+ {
1969
+ cHyphenFilter =
1970
+ rb_define_class_under(mAnalysis, "HyphenFilter", cTokenStream);
1971
+ frb_mark_cclass(cHyphenFilter);
1972
+ rb_define_alloc_func(cHyphenFilter, frb_data_alloc);
1973
+ rb_define_method(cHyphenFilter, "initialize", frb_hyphen_filter_init, 1);
1974
+ }
1975
+
1976
+ /*
1977
+ * Document-class: Ferret::Analysis::MappingFilter
1978
+ *
1979
+ * == Summary
1980
+ *
1981
+ * A MappingFilter maps strings in tokens. This is usually used to map UTF-8
1982
+ * characters to ASCII characters for easier searching and better search
1983
+ * recall. The mapping is compiled into a Deterministic Finite Automata so it
1984
+ * is super fast. This Filter can therefor be used for indexing very large
1985
+ * datasets. Currently regular expressions are not supported. If you are
1986
+ * really interested in the feature, please contact me at dbalmain@gmail.com.
1987
+ *
1988
+ * == Example
1989
+ *
1990
+ * mapping = {
1991
+ * ['à','á','â','ã','ä','å','ā','ă'] => 'a',
1992
+ * 'æ' => 'ae',
1993
+ * ['ď','đ'] => 'd',
1994
+ * ['ç','ć','č','ĉ','ċ'] => 'c',
1995
+ * ['è','é','ê','ë','ē','ę','ě','ĕ','ė',] => 'e',
1996
+ * ['ƒ'] => 'f',
1997
+ * ['ĝ','ğ','ġ','ģ'] => 'g',
1998
+ * ['ĥ','ħ'] => 'h',
1999
+ * ['ì','ì','í','î','ï','ī','ĩ','ĭ'] => 'i',
2000
+ * ['į','ı','ij','ĵ'] => 'j',
2001
+ * ['ķ','ĸ'] => 'k',
2002
+ * ['ł','ľ','ĺ','ļ','ŀ'] => 'l',
2003
+ * ['ñ','ń','ň','ņ','ʼn','ŋ'] => 'n',
2004
+ * ['ò','ó','ô','õ','ö','ø','ō','ő','ŏ','ŏ'] => 'o',
2005
+ * ['œ'] => 'oek',
2006
+ * ['ą'] => 'q',
2007
+ * ['ŕ','ř','ŗ'] => 'r',
2008
+ * ['ś','š','ş','ŝ','ș'] => 's',
2009
+ * ['ť','ţ','ŧ','ț'] => 't',
2010
+ * ['ù','ú','û','ü','ū','ů','ű','ŭ','ũ','ų'] => 'u',
2011
+ * ['ŵ'] => 'w',
2012
+ * ['ý','ÿ','ŷ'] => 'y',
2013
+ * ['ž','ż','ź'] => 'z'
2014
+ * }
2015
+ * filt = MappingFilter.new(token_stream, mapping)
2016
+ */
2017
+ static void Init_MappingFilter(void)
2018
+ {
2019
+ cMappingFilter =
2020
+ rb_define_class_under(mAnalysis, "MappingFilter", cTokenStream);
2021
+ frb_mark_cclass(cMappingFilter);
2022
+ rb_define_alloc_func(cMappingFilter, frb_data_alloc);
2023
+ rb_define_method(cMappingFilter, "initialize",
2024
+ frb_mapping_filter_init, 2);
2025
+ }
2026
+
2027
+ /*
2028
+ * Document-class: Ferret::Analysis::StopFilter
2029
+ *
2030
+ * == Summary
2031
+ *
2032
+ * A StopFilter filters *stop-words* from a TokenStream. Stop-words are words
2033
+ * that you don't wish to be index. Usually they will be common words like
2034
+ * "the" and "and" although you can specify whichever words you want.
2035
+ *
2036
+ * === Example
2037
+ *
2038
+ * ["the", "pig", "and", "whistle"] => ["pig", "whistle"]
2039
+ */
2040
+ static void Init_StopFilter(void)
2041
+ {
2042
+ cStopFilter =
2043
+ rb_define_class_under(mAnalysis, "StopFilter", cTokenStream);
2044
+ frb_mark_cclass(cStopFilter);
2045
+ rb_define_alloc_func(cStopFilter, frb_data_alloc);
2046
+ rb_define_method(cStopFilter, "initialize",
2047
+ frb_stop_filter_init, -1);
2048
+ }
2049
+
2050
+ /*
2051
+ * Document-class: Ferret::Analysis::StemFilter
2052
+ *
2053
+ * == Summary
2054
+ *
2055
+ * A StemFilter takes a term and transforms the term as per the SnowBall
2056
+ * stemming algorithm. Note: the input to the stemming filter must already
2057
+ * be in lower case, so you will need to use LowerCaseFilter or lowercasing
2058
+ * Tokenizer further down the Tokenizer chain in order for this to work
2059
+ * properly!
2060
+ *
2061
+ * === Available algorithms and encodings
2062
+ *
2063
+ * Algorithm Algorithm Pseudonyms Encoding
2064
+ * ----------------------------------------------------------------
2065
+ * "danish", | "da", "dan" | "ISO_8859_1", "UTF_8"
2066
+ * "dutch", | "dut", "nld" | "ISO_8859_1", "UTF_8"
2067
+ * "english", | "en", "eng" | "ISO_8859_1", "UTF_8"
2068
+ * "finnish", | "fi", "fin" | "ISO_8859_1", "UTF_8"
2069
+ * "french", | "fr", "fra", "fre" | "ISO_8859_1", "UTF_8"
2070
+ * "german", | "de", "deu", "ge", "ger" | "ISO_8859_1", "UTF_8"
2071
+ * "hungarian", | "hu", "hun" | "ISO_8859_1", "UTF_8"
2072
+ * "italian", | "it", "ita" | "ISO_8859_1", "UTF_8"
2073
+ * "norwegian", | "nl", "no" | "ISO_8859_1", "UTF_8"
2074
+ * "porter", | | "ISO_8859_1", "UTF_8"
2075
+ * "portuguese", | "por", "pt" | "ISO_8859_1", "UTF_8"
2076
+ * "romanian", | "ro", "ron", "rum" | "ISO_8859_2", "UTF_8"
2077
+ * "russian", | "ru", "rus" | "KOI8_R", "UTF_8"
2078
+ * "spanish", | "es", "esl" | "ISO_8859_1", "UTF_8"
2079
+ * "swedish", | "sv", "swe" | "ISO_8859_1", "UTF_8"
2080
+ * "turkish", | "tr", "tur" | "UTF_8"
2081
+ *
2082
+ *
2083
+ * === New Stemmers
2084
+ *
2085
+ * The following stemmers have recently benn added. Please try them out;
2086
+ *
2087
+ * * Hungarian
2088
+ * * Romanian
2089
+ * * Turkish
2090
+ *
2091
+ * === Example
2092
+ *
2093
+ * To use this filter with other analyzers, you'll want to write an Analyzer
2094
+ * class that sets up the TokenStream chain as you want it. To use this with
2095
+ * a lowercasing Tokenizer, for example, you'd write an analyzer like this:
2096
+ *
2097
+ * def MyAnalyzer < Analyzer
2098
+ * def token_stream(field, str)
2099
+ * return StemFilter.new(LowerCaseFilter.new(StandardTokenizer.new(str)))
2100
+ * end
2101
+ * end
2102
+ *
2103
+ * "debate debates debated debating debater"
2104
+ * => ["debat", "debat", "debat", "debat", "debat"]
2105
+ *
2106
+ * === Attributes
2107
+ *
2108
+ * token_stream:: TokenStream to be filtered
2109
+ * algorithm:: The algorithm (or language) to use (default: "english")
2110
+ * encoding:: The encoding of the data (default: "UTF-8")
2111
+ */
2112
+ static void Init_StemFilter(void)
2113
+ {
2114
+ cStemFilter =
2115
+ rb_define_class_under(mAnalysis, "StemFilter", cTokenStream);
2116
+ frb_mark_cclass(cStemFilter);
2117
+ rb_define_alloc_func(cStemFilter, frb_data_alloc);
2118
+ rb_define_method(cStemFilter, "initialize",
2119
+ frb_stem_filter_init, -1);
2120
+ }
2121
+
2122
+ /*************************/
2123
+ /*** * * Analyzers * * ***/
2124
+ /*************************/
2125
+
2126
+ /*
2127
+ * Document-class: Ferret::Analysis::Analyzer
2128
+ *
2129
+ * == Summary
2130
+ *
2131
+ * An FrtAnalyzer builds TokenStreams, which analyze text. It thus represents
2132
+ * a policy for extracting index terms from text.
2133
+ *
2134
+ * Typical implementations first build a Tokenizer, which breaks the stream
2135
+ * of characters from the Reader into raw Tokens. One or more TokenFilters
2136
+ * may then be applied to the output of the Tokenizer.
2137
+ *
2138
+ * The default FrtAnalyzer just creates a LowerCaseTokenizer which converts
2139
+ * all text to lowercase tokens. See LowerCaseTokenizer for more details.
2140
+ *
2141
+ * === Example
2142
+ *
2143
+ * To create your own custom FrtAnalyzer you simply need to implement a
2144
+ * token_stream method which takes the field name and the data to be
2145
+ * tokenized as parameters and returns a TokenStream. Most analyzers
2146
+ * typically ignore the field name.
2147
+ *
2148
+ * Here we'll create a StemmingAnalyzer;
2149
+ *
2150
+ * def MyAnalyzer < Analyzer
2151
+ * def token_stream(field, str)
2152
+ * return StemFilter.new(LowerCaseFilter.new(StandardTokenizer.new(str)))
2153
+ * end
2154
+ * end
2155
+ */
2156
+ static void Init_Analyzer(void)
2157
+ {
2158
+ cAnalyzer =
2159
+ rb_define_class_under(mAnalysis, "Analyzer", rb_cObject);
2160
+ frb_mark_cclass(cAnalyzer);
2161
+ rb_define_alloc_func(cAnalyzer, frb_data_alloc);
2162
+ rb_define_method(cAnalyzer, "initialize", frb_letter_analyzer_init, -1);
2163
+ rb_define_method(cAnalyzer, "token_stream", frb_analyzer_token_stream, 2);
2164
+ }
2165
+
2166
+ /*
2167
+ * Document-class: Ferret::Analysis::AsciiLetterAnalyzer
2168
+ *
2169
+ * == Summary
2170
+ *
2171
+ * An AsciiLetterAnalyzer creates a TokenStream that splits the input up into
2172
+ * maximal strings of ASCII characters. If implemented in Ruby it would look
2173
+ * like;
2174
+ *
2175
+ * class AsciiLetterAnalyzer
2176
+ * def initialize(lower = true)
2177
+ * @lower = lower
2178
+ * end
2179
+ *
2180
+ * def token_stream(field, str)
2181
+ * if @lower
2182
+ * return AsciiLowerCaseFilter.new(AsciiLetterTokenizer.new(str))
2183
+ * else
2184
+ * return AsciiLetterTokenizer.new(str)
2185
+ * end
2186
+ * end
2187
+ * end
2188
+ *
2189
+ * As you can see it makes use of the AsciiLetterTokenizer and
2190
+ * AsciiLowerCaseFilter. Note that this tokenizer won't recognize non-ASCII
2191
+ * characters so you should use the LetterAnalyzer is you want to analyze
2192
+ * multi-byte data like "UTF-8".
2193
+ */
2194
+ static void Init_AsciiLetterAnalyzer(void)
2195
+ {
2196
+ cAsciiLetterAnalyzer =
2197
+ rb_define_class_under(mAnalysis, "AsciiLetterAnalyzer", cAnalyzer);
2198
+ frb_mark_cclass(cAsciiLetterAnalyzer);
2199
+ rb_define_alloc_func(cAsciiLetterAnalyzer, frb_data_alloc);
2200
+ rb_define_method(cAsciiLetterAnalyzer, "initialize",
2201
+ frb_a_letter_analyzer_init, -1);
2202
+ }
2203
+
2204
+ /*
2205
+ * Document-class: Ferret::Analysis::LetterAnalyzer
2206
+ *
2207
+ * == Summary
2208
+ *
2209
+ * A LetterAnalyzer creates a TokenStream that splits the input up into
2210
+ * maximal strings of characters as recognized by the current locale. If
2211
+ * implemented in Ruby it would look like;
2212
+ *
2213
+ * class LetterAnalyzer
2214
+ * def initialize(lower = true)
2215
+ * @lower = lower
2216
+ * end
2217
+ *
2218
+ * def token_stream(field, str)
2219
+ * return LetterTokenizer.new(str, @lower)
2220
+ * end
2221
+ * end
2222
+ *
2223
+ * As you can see it makes use of the LetterTokenizer.
2224
+ */
2225
+ static void Init_LetterAnalyzer(void)
2226
+ {
2227
+ cLetterAnalyzer =
2228
+ rb_define_class_under(mAnalysis, "LetterAnalyzer", cAnalyzer);
2229
+ frb_mark_cclass(cLetterAnalyzer);
2230
+ rb_define_alloc_func(cLetterAnalyzer, frb_data_alloc);
2231
+ rb_define_method(cLetterAnalyzer, "initialize",
2232
+ frb_letter_analyzer_init, -1);
2233
+ }
2234
+
2235
+ /*
2236
+ * Document-class: Ferret::Analysis::AsciiWhiteSpaceAnalyzer
2237
+ *
2238
+ * == Summary
2239
+ *
2240
+ * The AsciiWhiteSpaceAnalyzer recognizes tokens as maximal strings of
2241
+ * non-whitespace characters. If implemented in Ruby the
2242
+ * AsciiWhiteSpaceAnalyzer would look like;
2243
+ *
2244
+ * class AsciiWhiteSpaceAnalyzer
2245
+ * def initialize(lower = true)
2246
+ * @lower = lower
2247
+ * end
2248
+ *
2249
+ * def token_stream(field, str)
2250
+ * if @lower
2251
+ * return AsciiLowerCaseFilter.new(AsciiWhiteSpaceTokenizer.new(str))
2252
+ * else
2253
+ * return AsciiWhiteSpaceTokenizer.new(str)
2254
+ * end
2255
+ * end
2256
+ * end
2257
+ *
2258
+ * As you can see it makes use of the AsciiWhiteSpaceTokenizer. You should
2259
+ * use WhiteSpaceAnalyzer if you want to recognize multibyte encodings such
2260
+ * as "UTF-8".
2261
+ */
2262
+ static void Init_AsciiWhiteSpaceAnalyzer(void)
2263
+ {
2264
+ cAsciiWhiteSpaceAnalyzer =
2265
+ rb_define_class_under(mAnalysis, "AsciiWhiteSpaceAnalyzer", cAnalyzer);
2266
+ frb_mark_cclass(cAsciiWhiteSpaceAnalyzer);
2267
+ rb_define_alloc_func(cAsciiWhiteSpaceAnalyzer, frb_data_alloc);
2268
+ rb_define_method(cAsciiWhiteSpaceAnalyzer, "initialize",
2269
+ frb_a_white_space_analyzer_init, -1);
2270
+ }
2271
+
2272
+ /*
2273
+ * Document-class: Ferret::Analysis::WhiteSpaceAnalyzer
2274
+ *
2275
+ * == Summary
2276
+ *
2277
+ * The WhiteSpaceAnalyzer recognizes tokens as maximal strings of
2278
+ * non-whitespace characters. If implemented in Ruby the WhiteSpaceAnalyzer
2279
+ * would look like;
2280
+ *
2281
+ * class WhiteSpaceAnalyzer
2282
+ * def initialize(lower = true)
2283
+ * @lower = lower
2284
+ * end
2285
+ *
2286
+ * def token_stream(field, str)
2287
+ * return WhiteSpaceTokenizer.new(str, @lower)
2288
+ * end
2289
+ * end
2290
+ *
2291
+ * As you can see it makes use of the WhiteSpaceTokenizer.
2292
+ */
2293
+ static void Init_WhiteSpaceAnalyzer(void)
2294
+ {
2295
+ cWhiteSpaceAnalyzer =
2296
+ rb_define_class_under(mAnalysis, "WhiteSpaceAnalyzer", cAnalyzer);
2297
+ frb_mark_cclass(cWhiteSpaceAnalyzer);
2298
+ rb_define_alloc_func(cWhiteSpaceAnalyzer, frb_data_alloc);
2299
+ rb_define_method(cWhiteSpaceAnalyzer, "initialize",
2300
+ frb_white_space_analyzer_init, -1);
2301
+ }
2302
+
2303
+ /*
2304
+ * Document-class: Ferret::Analysis::AsciiStandardAnalyzer
2305
+ *
2306
+ * == Summary
2307
+ *
2308
+ * The AsciiStandardAnalyzer is the most advanced of the available
2309
+ * ASCII-analyzers. If it were implemented in Ruby it would look like this;
2310
+ *
2311
+ * class AsciiStandardAnalyzer
2312
+ * def initialize(stop_words = FRT_FULL_ENGLISH_STOP_WORDS, lower = true)
2313
+ * @lower = lower
2314
+ * @stop_words = stop_words
2315
+ * end
2316
+ *
2317
+ * def token_stream(field, str)
2318
+ * ts = AsciiStandardTokenizer.new(str)
2319
+ * ts = AsciiLowerCaseFilter.new(ts) if @lower
2320
+ * ts = StopFilter.new(ts, @stop_words)
2321
+ * ts = HyphenFilter.new(ts)
2322
+ * end
2323
+ * end
2324
+ *
2325
+ * As you can see it makes use of the AsciiStandardTokenizer and you can also
2326
+ * add your own list of stop-words if you wish. Note that this tokenizer
2327
+ * won't recognize non-ASCII characters so you should use the
2328
+ * StandardAnalyzer is you want to analyze multi-byte data like "UTF-8".
2329
+ */
2330
+ static void Init_AsciiStandardAnalyzer(void)
2331
+ {
2332
+ cAsciiStandardAnalyzer =
2333
+ rb_define_class_under(mAnalysis, "AsciiStandardAnalyzer", cAnalyzer);
2334
+ frb_mark_cclass(cAsciiStandardAnalyzer);
2335
+ rb_define_alloc_func(cAsciiStandardAnalyzer, frb_data_alloc);
2336
+ rb_define_method(cAsciiStandardAnalyzer, "initialize",
2337
+ frb_a_standard_analyzer_init, -1);
2338
+ }
2339
+
2340
+ /*
2341
+ * Document-class: Ferret::Analysis::StandardAnalyzer
2342
+ *
2343
+ * == Summary
2344
+ *
2345
+ * The StandardAnalyzer is the most advanced of the available analyzers. If
2346
+ * it were implemented in Ruby it would look like this;
2347
+ *
2348
+ * class StandardAnalyzer
2349
+ * def initialize(stop_words = FRT_FULL_ENGLISH_STOP_WORDS, lower = true)
2350
+ * @lower = lower
2351
+ * @stop_words = stop_words
2352
+ * end
2353
+ *
2354
+ * def token_stream(field, str)
2355
+ * ts = StandardTokenizer.new(str)
2356
+ * ts = LowerCaseFilter.new(ts) if @lower
2357
+ * ts = StopFilter.new(ts, @stop_words)
2358
+ * ts = HyphenFilter.new(ts)
2359
+ * end
2360
+ * end
2361
+ *
2362
+ * As you can see it makes use of the StandardTokenizer and you can also add
2363
+ * your own list of stopwords if you wish.
2364
+ */
2365
+ static void Init_StandardAnalyzer(void)
2366
+ {
2367
+ cStandardAnalyzer =
2368
+ rb_define_class_under(mAnalysis, "StandardAnalyzer", cAnalyzer);
2369
+ frb_mark_cclass(cStandardAnalyzer);
2370
+ rb_define_alloc_func(cStandardAnalyzer, frb_data_alloc);
2371
+ rb_define_method(cStandardAnalyzer, "initialize",
2372
+ frb_standard_analyzer_init, -1);
2373
+ }
2374
+
2375
+ /*
2376
+ * Document-class: Ferret::Analysis::PerFieldAnalyzer
2377
+ *
2378
+ * == Summary
2379
+ *
2380
+ * The PerFieldAnalyzer is for use when you want to analyze different fields
2381
+ * with different analyzers. With the PerFieldAnalyzer you can specify how
2382
+ * you want each field analyzed.
2383
+ *
2384
+ * === Example
2385
+ *
2386
+ * # Create a new PerFieldAnalyzer which uses StandardAnalyzer by default
2387
+ * pfa = PerFieldAnalyzer.new(StandardAnalyzer.new())
2388
+ *
2389
+ * # Use the WhiteSpaceAnalyzer with no lowercasing on the :title field
2390
+ * pfa[:title] = WhiteSpaceAnalyzer.new(false)
2391
+ *
2392
+ * # Use a custom analyzer on the :created_at field
2393
+ * pfa[:created_at] = DateAnalyzer.new
2394
+ */
2395
+ static void Init_PerFieldAnalyzer(void)
2396
+ {
2397
+ cPerFieldAnalyzer =
2398
+ rb_define_class_under(mAnalysis, "PerFieldAnalyzer", cAnalyzer);
2399
+ frb_mark_cclass(cPerFieldAnalyzer);
2400
+ rb_define_alloc_func(cPerFieldAnalyzer, frb_data_alloc);
2401
+ rb_define_method(cPerFieldAnalyzer, "initialize",
2402
+ frb_per_field_analyzer_init, 1);
2403
+ rb_define_method(cPerFieldAnalyzer, "add_field",
2404
+ frb_per_field_analyzer_add_field, 2);
2405
+ rb_define_method(cPerFieldAnalyzer, "[]=",
2406
+ frb_per_field_analyzer_add_field, 2);
2407
+ rb_define_method(cPerFieldAnalyzer, "token_stream",
2408
+ frb_pfa_analyzer_token_stream, 2);
2409
+ }
2410
+
2411
+ /*
2412
+ * Document-class: Ferret::Analysis::RegExpAnalyzer
2413
+ *
2414
+ * == Summary
2415
+ *
2416
+ * Using a RegExpAnalyzer is a simple way to create a custom analyzer. If
2417
+ * implemented in Ruby it would look like this;
2418
+ *
2419
+ * class RegExpAnalyzer
2420
+ * def initialize(reg_exp, lower = true)
2421
+ * @lower = lower
2422
+ * @reg_exp = reg_exp
2423
+ * end
2424
+ *
2425
+ * def token_stream(field, str)
2426
+ * if @lower
2427
+ * return LowerCaseFilter.new(RegExpTokenizer.new(str, reg_exp))
2428
+ * else
2429
+ * return RegExpTokenizer.new(str, reg_exp)
2430
+ * end
2431
+ * end
2432
+ * end
2433
+ *
2434
+ * === Example
2435
+ *
2436
+ * csv_analyzer = RegExpAnalyzer.new(/[^,]+/, false)
2437
+ */
2438
+ static void Init_RegExpAnalyzer(void)
2439
+ {
2440
+ cRegExpAnalyzer =
2441
+ rb_define_class_under(mAnalysis, "RegExpAnalyzer", cAnalyzer);
2442
+ frb_mark_cclass(cRegExpAnalyzer);
2443
+ rb_define_alloc_func(cRegExpAnalyzer, frb_data_alloc);
2444
+ rb_define_method(cRegExpAnalyzer, "initialize",
2445
+ frb_re_analyzer_init, -1);
2446
+ rb_define_method(cRegExpAnalyzer, "token_stream",
2447
+ frb_re_analyzer_token_stream, 2);
2448
+ }
2449
+
2450
+ /* rdoc hack
2451
+ extern VALUE mFerret = rb_define_module("Ferret");
2452
+ */
2453
+
2454
+ /*
2455
+ * Document-module: Ferret::Analysis
2456
+ *
2457
+ * == Summary
2458
+ *
2459
+ * The Analysis module contains all the classes used to analyze and tokenize
2460
+ * the data to be indexed. There are three main classes you need to know
2461
+ * about when dealing with analysis; Analyzer, TokenStream and Token.
2462
+ *
2463
+ * == Classes
2464
+ *
2465
+ * === Analyzer
2466
+ *
2467
+ * Analyzers handle all of your tokenizing needs. You pass an FrtAnalyzer to the
2468
+ * indexing class when you create it and it will create the TokenStreams
2469
+ * necessary to tokenize the fields in the documents. Most of the time you
2470
+ * won't need to worry about TokenStreams and Tokens, one of the Analyzers
2471
+ * distributed with Ferret will do exactly what you need. Otherwise you'll
2472
+ * need to implement a custom analyzer.
2473
+ *
2474
+ * === TokenStream
2475
+ *
2476
+ * A TokenStream is an enumeration of Tokens. There are two standard types of
2477
+ * TokenStream; Tokenizer and TokenFilter. A Tokenizer takes a String and
2478
+ * turns it into a list of Tokens. A TokenFilter takes another TokenStream
2479
+ * and post-processes the Tokens. You can chain as many TokenFilters together
2480
+ * as you like but they always need to finish with a Tokenizer.
2481
+ *
2482
+ * === Token
2483
+ *
2484
+ * A Token is a single term from a document field. A token contains the text
2485
+ * representing the term as well as the start and end offset of the token.
2486
+ * The start and end offset will represent the token as it appears in the
2487
+ * source field. Some TokenFilters may change the text in the Token but the
2488
+ * start and end offsets should stay the same so (end - start) won't
2489
+ * necessarily be equal to the length of text in the token. For example using
2490
+ * a stemming TokenFilter the term "Beginning" might have start and end
2491
+ * offsets of 10 and 19 respectively ("Beginning".length == 9) but Token#text
2492
+ * might be "begin" (after stemming).
2493
+ */
2494
/*
 * Entry point that registers the Ferret::Analysis Ruby module: it interns
 * the method/ivar IDs used for Ruby callbacks, exposes the built-in
 * stop-word lists as module constants, and then runs the per-class Init_*
 * registration functions.  Called once at extension load time.
 *
 * NOTE(review): the Init_* calls below appear to be order-sensitive
 * (Token/TokenStream first, then tokenizers, then filters, then analyzers)
 * -- do not reorder without confirming the dependencies.
 */
void
Init_Analysis(void)
{
    /* Ferret::Analysis, nested under the top-level Ferret module. */
    mAnalysis = rb_define_module_under(mFerret, "Analysis");

    /* TokenStream Methods -- interned IDs used when invoking
     * Ruby-implemented token streams from C.
     * NOTE(review): id_reset interns "text=" while id_text interns the
     * instance variable "@text"; the names look mismatched with the
     * interned strings, but this may be intentional -- verify against
     * the call sites before changing. */
    id_next = rb_intern("next");
    id_reset = rb_intern("text=");
    id_clone = rb_intern("clone");
    id_text = rb_intern("@text");

    /* FrtAnalyzer Methods -- ID used to request a token stream from a
     * Ruby-implemented analyzer. */
    id_token_stream = rb_intern("token_stream");

    /* Hash published as Ferret::OBJECT_SPACE; presumably used to keep
     * Ruby wrapper objects reachable (GC-pinned) while C code holds
     * references to them -- confirm against its users. */
    object_space = rb_hash_new();
    rb_define_const(mFerret, "OBJECT_SPACE", object_space);

    /*** * * Locale stuff * * ***/
    /* Ferret.locale= / Ferret.locale accessors. */
    rb_define_singleton_method(mFerret, "locale=", frb_set_locale, 1);
    rb_define_singleton_method(mFerret, "locale", frb_get_locale, 0);

    /* Expose each built-in stop-word list as a Ruby constant on
     * Ferret::Analysis (converted via get_rstopwords). */
    rb_define_const(mAnalysis, "ENGLISH_STOP_WORDS",
                    get_rstopwords(FRT_ENGLISH_STOP_WORDS));
    rb_define_const(mAnalysis, "FULL_ENGLISH_STOP_WORDS",
                    get_rstopwords(FRT_FULL_ENGLISH_STOP_WORDS));
    rb_define_const(mAnalysis, "EXTENDED_ENGLISH_STOP_WORDS",
                    get_rstopwords(FRT_EXTENDED_ENGLISH_STOP_WORDS));
    rb_define_const(mAnalysis, "FULL_FRENCH_STOP_WORDS",
                    get_rstopwords(FRT_FULL_FRENCH_STOP_WORDS));
    rb_define_const(mAnalysis, "FULL_SPANISH_STOP_WORDS",
                    get_rstopwords(FRT_FULL_SPANISH_STOP_WORDS));
    rb_define_const(mAnalysis, "FULL_PORTUGUESE_STOP_WORDS",
                    get_rstopwords(FRT_FULL_PORTUGUESE_STOP_WORDS));
    rb_define_const(mAnalysis, "FULL_ITALIAN_STOP_WORDS",
                    get_rstopwords(FRT_FULL_ITALIAN_STOP_WORDS));
    rb_define_const(mAnalysis, "FULL_GERMAN_STOP_WORDS",
                    get_rstopwords(FRT_FULL_GERMAN_STOP_WORDS));
    rb_define_const(mAnalysis, "FULL_DUTCH_STOP_WORDS",
                    get_rstopwords(FRT_FULL_DUTCH_STOP_WORDS));
    rb_define_const(mAnalysis, "FULL_SWEDISH_STOP_WORDS",
                    get_rstopwords(FRT_FULL_SWEDISH_STOP_WORDS));
    rb_define_const(mAnalysis, "FULL_NORWEGIAN_STOP_WORDS",
                    get_rstopwords(FRT_FULL_NORWEGIAN_STOP_WORDS));
    rb_define_const(mAnalysis, "FULL_DANISH_STOP_WORDS",
                    get_rstopwords(FRT_FULL_DANISH_STOP_WORDS));
    rb_define_const(mAnalysis, "FULL_RUSSIAN_STOP_WORDS",
                    get_rstopwords(FRT_FULL_RUSSIAN_STOP_WORDS));
    rb_define_const(mAnalysis, "FULL_FINNISH_STOP_WORDS",
                    get_rstopwords(FRT_FULL_FINNISH_STOP_WORDS));
    rb_define_const(mAnalysis, "FULL_HUNGARIAN_STOP_WORDS",
                    get_rstopwords(FRT_FULL_HUNGARIAN_STOP_WORDS));

    /* Core token classes first. */
    Init_Token();
    Init_TokenStream();

    /* Tokenizers (ASCII variant before multi-byte variant). */
    Init_AsciiLetterTokenizer();
    Init_LetterTokenizer();

    Init_AsciiWhiteSpaceTokenizer();
    Init_WhiteSpaceTokenizer();

    Init_AsciiStandardTokenizer();
    Init_StandardTokenizer();

    Init_RegExpTokenizer();

    /* Token filters, which wrap the tokenizers above. */
    Init_AsciiLowerCaseFilter();
    Init_LowerCaseFilter();
    Init_HyphenFilter();
    Init_StopFilter();
    Init_MappingFilter();
    Init_StemFilter();

    /* Analyzers last: they compose tokenizers and filters. */
    Init_Analyzer();
    Init_AsciiLetterAnalyzer();
    Init_LetterAnalyzer();
    Init_AsciiWhiteSpaceAnalyzer();
    Init_WhiteSpaceAnalyzer();
    Init_AsciiStandardAnalyzer();
    Init_StandardAnalyzer();
    Init_PerFieldAnalyzer();
    Init_RegExpAnalyzer();

}