jk-ferret 0.11.8.2

Files changed (228)
  1. data/CHANGELOG +24 -0
  2. data/MIT-LICENSE +20 -0
  3. data/README +90 -0
  4. data/RELEASE_CHANGES +137 -0
  5. data/RELEASE_NOTES +60 -0
  6. data/Rakefile +443 -0
  7. data/TODO +109 -0
  8. data/TUTORIAL +231 -0
  9. data/bin/ferret-browser +79 -0
  10. data/ext/BZLIB_blocksort.c +1094 -0
  11. data/ext/BZLIB_bzlib.c +1578 -0
  12. data/ext/BZLIB_compress.c +672 -0
  13. data/ext/BZLIB_crctable.c +104 -0
  14. data/ext/BZLIB_decompress.c +626 -0
  15. data/ext/BZLIB_huffman.c +205 -0
  16. data/ext/BZLIB_randtable.c +84 -0
  17. data/ext/STEMMER_api.c +66 -0
  18. data/ext/STEMMER_libstemmer.c +93 -0
  19. data/ext/STEMMER_stem_ISO_8859_1_danish.c +337 -0
  20. data/ext/STEMMER_stem_ISO_8859_1_dutch.c +624 -0
  21. data/ext/STEMMER_stem_ISO_8859_1_english.c +1117 -0
  22. data/ext/STEMMER_stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/STEMMER_stem_ISO_8859_1_french.c +1246 -0
  24. data/ext/STEMMER_stem_ISO_8859_1_german.c +503 -0
  25. data/ext/STEMMER_stem_ISO_8859_1_hungarian.c +1230 -0
  26. data/ext/STEMMER_stem_ISO_8859_1_italian.c +1065 -0
  27. data/ext/STEMMER_stem_ISO_8859_1_norwegian.c +297 -0
  28. data/ext/STEMMER_stem_ISO_8859_1_porter.c +749 -0
  29. data/ext/STEMMER_stem_ISO_8859_1_portuguese.c +1017 -0
  30. data/ext/STEMMER_stem_ISO_8859_1_spanish.c +1093 -0
  31. data/ext/STEMMER_stem_ISO_8859_1_swedish.c +307 -0
  32. data/ext/STEMMER_stem_ISO_8859_2_romanian.c +998 -0
  33. data/ext/STEMMER_stem_KOI8_R_russian.c +700 -0
  34. data/ext/STEMMER_stem_UTF_8_danish.c +339 -0
  35. data/ext/STEMMER_stem_UTF_8_dutch.c +634 -0
  36. data/ext/STEMMER_stem_UTF_8_english.c +1125 -0
  37. data/ext/STEMMER_stem_UTF_8_finnish.c +768 -0
  38. data/ext/STEMMER_stem_UTF_8_french.c +1256 -0
  39. data/ext/STEMMER_stem_UTF_8_german.c +509 -0
  40. data/ext/STEMMER_stem_UTF_8_hungarian.c +1234 -0
  41. data/ext/STEMMER_stem_UTF_8_italian.c +1073 -0
  42. data/ext/STEMMER_stem_UTF_8_norwegian.c +299 -0
  43. data/ext/STEMMER_stem_UTF_8_porter.c +755 -0
  44. data/ext/STEMMER_stem_UTF_8_portuguese.c +1023 -0
  45. data/ext/STEMMER_stem_UTF_8_romanian.c +1004 -0
  46. data/ext/STEMMER_stem_UTF_8_russian.c +694 -0
  47. data/ext/STEMMER_stem_UTF_8_spanish.c +1097 -0
  48. data/ext/STEMMER_stem_UTF_8_swedish.c +309 -0
  49. data/ext/STEMMER_stem_UTF_8_turkish.c +2205 -0
  50. data/ext/STEMMER_utilities.c +478 -0
  51. data/ext/analysis.c +1710 -0
  52. data/ext/analysis.h +266 -0
  53. data/ext/api.h +26 -0
  54. data/ext/array.c +125 -0
  55. data/ext/array.h +62 -0
  56. data/ext/bitvector.c +96 -0
  57. data/ext/bitvector.h +594 -0
  58. data/ext/bzlib.h +282 -0
  59. data/ext/bzlib_private.h +503 -0
  60. data/ext/compound_io.c +384 -0
  61. data/ext/config.h +52 -0
  62. data/ext/document.c +159 -0
  63. data/ext/document.h +63 -0
  64. data/ext/except.c +102 -0
  65. data/ext/except.h +176 -0
  66. data/ext/extconf.rb +15 -0
  67. data/ext/ferret.c +416 -0
  68. data/ext/ferret.h +94 -0
  69. data/ext/field_index.c +262 -0
  70. data/ext/field_index.h +52 -0
  71. data/ext/filter.c +157 -0
  72. data/ext/fs_store.c +493 -0
  73. data/ext/global.c +458 -0
  74. data/ext/global.h +302 -0
  75. data/ext/hash.c +524 -0
  76. data/ext/hash.h +515 -0
  77. data/ext/hashset.c +192 -0
  78. data/ext/hashset.h +215 -0
  79. data/ext/header.h +58 -0
  80. data/ext/helper.c +63 -0
  81. data/ext/helper.h +21 -0
  82. data/ext/index.c +6804 -0
  83. data/ext/index.h +935 -0
  84. data/ext/internal.h +1019 -0
  85. data/ext/lang.c +10 -0
  86. data/ext/lang.h +68 -0
  87. data/ext/libstemmer.h +79 -0
  88. data/ext/mempool.c +88 -0
  89. data/ext/mempool.h +43 -0
  90. data/ext/modules.h +190 -0
  91. data/ext/multimapper.c +351 -0
  92. data/ext/multimapper.h +60 -0
  93. data/ext/posh.c +1006 -0
  94. data/ext/posh.h +973 -0
  95. data/ext/priorityqueue.c +149 -0
  96. data/ext/priorityqueue.h +155 -0
  97. data/ext/q_boolean.c +1621 -0
  98. data/ext/q_const_score.c +162 -0
  99. data/ext/q_filtered_query.c +212 -0
  100. data/ext/q_fuzzy.c +280 -0
  101. data/ext/q_match_all.c +149 -0
  102. data/ext/q_multi_term.c +673 -0
  103. data/ext/q_parser.c +3103 -0
  104. data/ext/q_phrase.c +1206 -0
  105. data/ext/q_prefix.c +98 -0
  106. data/ext/q_range.c +682 -0
  107. data/ext/q_span.c +2390 -0
  108. data/ext/q_term.c +337 -0
  109. data/ext/q_wildcard.c +167 -0
  110. data/ext/r_analysis.c +2626 -0
  111. data/ext/r_index.c +3468 -0
  112. data/ext/r_qparser.c +635 -0
  113. data/ext/r_search.c +4490 -0
  114. data/ext/r_store.c +513 -0
  115. data/ext/r_utils.c +1131 -0
  116. data/ext/ram_store.c +476 -0
  117. data/ext/scanner.c +895 -0
  118. data/ext/scanner.h +36 -0
  119. data/ext/scanner_mb.c +6701 -0
  120. data/ext/scanner_utf8.c +4415 -0
  121. data/ext/search.c +1864 -0
  122. data/ext/search.h +953 -0
  123. data/ext/similarity.c +151 -0
  124. data/ext/similarity.h +89 -0
  125. data/ext/sort.c +786 -0
  126. data/ext/stem_ISO_8859_1_danish.h +16 -0
  127. data/ext/stem_ISO_8859_1_dutch.h +16 -0
  128. data/ext/stem_ISO_8859_1_english.h +16 -0
  129. data/ext/stem_ISO_8859_1_finnish.h +16 -0
  130. data/ext/stem_ISO_8859_1_french.h +16 -0
  131. data/ext/stem_ISO_8859_1_german.h +16 -0
  132. data/ext/stem_ISO_8859_1_hungarian.h +16 -0
  133. data/ext/stem_ISO_8859_1_italian.h +16 -0
  134. data/ext/stem_ISO_8859_1_norwegian.h +16 -0
  135. data/ext/stem_ISO_8859_1_porter.h +16 -0
  136. data/ext/stem_ISO_8859_1_portuguese.h +16 -0
  137. data/ext/stem_ISO_8859_1_spanish.h +16 -0
  138. data/ext/stem_ISO_8859_1_swedish.h +16 -0
  139. data/ext/stem_ISO_8859_2_romanian.h +16 -0
  140. data/ext/stem_KOI8_R_russian.h +16 -0
  141. data/ext/stem_UTF_8_danish.h +16 -0
  142. data/ext/stem_UTF_8_dutch.h +16 -0
  143. data/ext/stem_UTF_8_english.h +16 -0
  144. data/ext/stem_UTF_8_finnish.h +16 -0
  145. data/ext/stem_UTF_8_french.h +16 -0
  146. data/ext/stem_UTF_8_german.h +16 -0
  147. data/ext/stem_UTF_8_hungarian.h +16 -0
  148. data/ext/stem_UTF_8_italian.h +16 -0
  149. data/ext/stem_UTF_8_norwegian.h +16 -0
  150. data/ext/stem_UTF_8_porter.h +16 -0
  151. data/ext/stem_UTF_8_portuguese.h +16 -0
  152. data/ext/stem_UTF_8_romanian.h +16 -0
  153. data/ext/stem_UTF_8_russian.h +16 -0
  154. data/ext/stem_UTF_8_spanish.h +16 -0
  155. data/ext/stem_UTF_8_swedish.h +16 -0
  156. data/ext/stem_UTF_8_turkish.h +16 -0
  157. data/ext/stopwords.c +410 -0
  158. data/ext/store.c +698 -0
  159. data/ext/store.h +799 -0
  160. data/ext/symbol.c +10 -0
  161. data/ext/symbol.h +23 -0
  162. data/ext/term_vectors.c +73 -0
  163. data/ext/threading.h +31 -0
  164. data/ext/win32.h +62 -0
  165. data/lib/ferret.rb +30 -0
  166. data/lib/ferret/browser.rb +246 -0
  167. data/lib/ferret/browser/s/global.js +192 -0
  168. data/lib/ferret/browser/s/style.css +148 -0
  169. data/lib/ferret/browser/views/document/list.rhtml +49 -0
  170. data/lib/ferret/browser/views/document/show.rhtml +27 -0
  171. data/lib/ferret/browser/views/error/index.rhtml +7 -0
  172. data/lib/ferret/browser/views/help/index.rhtml +8 -0
  173. data/lib/ferret/browser/views/home/index.rhtml +29 -0
  174. data/lib/ferret/browser/views/layout.rhtml +22 -0
  175. data/lib/ferret/browser/views/term-vector/index.rhtml +4 -0
  176. data/lib/ferret/browser/views/term/index.rhtml +199 -0
  177. data/lib/ferret/browser/views/term/termdocs.rhtml +1 -0
  178. data/lib/ferret/browser/webrick.rb +14 -0
  179. data/lib/ferret/document.rb +130 -0
  180. data/lib/ferret/field_infos.rb +44 -0
  181. data/lib/ferret/field_symbol.rb +87 -0
  182. data/lib/ferret/index.rb +973 -0
  183. data/lib/ferret/number_tools.rb +157 -0
  184. data/lib/ferret/version.rb +3 -0
  185. data/setup.rb +1555 -0
  186. data/test/long_running/largefile/tc_largefile.rb +46 -0
  187. data/test/test_all.rb +5 -0
  188. data/test/test_helper.rb +29 -0
  189. data/test/test_installed.rb +1 -0
  190. data/test/threading/number_to_spoken.rb +132 -0
  191. data/test/threading/thread_safety_index_test.rb +88 -0
  192. data/test/threading/thread_safety_read_write_test.rb +73 -0
  193. data/test/threading/thread_safety_test.rb +133 -0
  194. data/test/unit/analysis/tc_analyzer.rb +550 -0
  195. data/test/unit/analysis/tc_token_stream.rb +653 -0
  196. data/test/unit/index/tc_index.rb +867 -0
  197. data/test/unit/index/tc_index_reader.rb +699 -0
  198. data/test/unit/index/tc_index_writer.rb +447 -0
  199. data/test/unit/index/th_doc.rb +332 -0
  200. data/test/unit/query_parser/tc_query_parser.rb +238 -0
  201. data/test/unit/search/tc_filter.rb +156 -0
  202. data/test/unit/search/tc_fuzzy_query.rb +147 -0
  203. data/test/unit/search/tc_index_searcher.rb +67 -0
  204. data/test/unit/search/tc_multi_searcher.rb +128 -0
  205. data/test/unit/search/tc_multiple_search_requests.rb +58 -0
  206. data/test/unit/search/tc_search_and_sort.rb +179 -0
  207. data/test/unit/search/tc_sort.rb +49 -0
  208. data/test/unit/search/tc_sort_field.rb +27 -0
  209. data/test/unit/search/tc_spans.rb +190 -0
  210. data/test/unit/search/tm_searcher.rb +436 -0
  211. data/test/unit/store/tc_fs_store.rb +115 -0
  212. data/test/unit/store/tc_ram_store.rb +35 -0
  213. data/test/unit/store/tm_store.rb +34 -0
  214. data/test/unit/store/tm_store_lock.rb +68 -0
  215. data/test/unit/tc_document.rb +81 -0
  216. data/test/unit/tc_field_symbol.rb +26 -0
  217. data/test/unit/ts_analysis.rb +2 -0
  218. data/test/unit/ts_index.rb +2 -0
  219. data/test/unit/ts_largefile.rb +4 -0
  220. data/test/unit/ts_query_parser.rb +2 -0
  221. data/test/unit/ts_search.rb +2 -0
  222. data/test/unit/ts_store.rb +2 -0
  223. data/test/unit/ts_utils.rb +2 -0
  224. data/test/unit/utils/tc_bit_vector.rb +295 -0
  225. data/test/unit/utils/tc_number_tools.rb +117 -0
  226. data/test/unit/utils/tc_priority_queue.rb +106 -0
  227. data/test/utils/content_generator.rb +226 -0
  228. metadata +319 -0
data/test/unit/analysis/tc_token_stream.rb
@@ -0,0 +1,653 @@
+ # encoding: utf-8
+
+ require File.dirname(__FILE__) + "/../../test_helper"
+
+ puts "Loading once"
+ class TokenTest < Test::Unit::TestCase
+   include Ferret::Analysis
+   def test_token
+     t = Token.new("text", 1, 2, 3)
+     assert_equal("text", t.text)
+     assert_equal(1, t.start)
+     assert_equal(2, t.end)
+     assert_equal(3, t.pos_inc)
+     t.text = "yada yada yada"
+     t.start = 11
+     t.end = 12
+     t.pos_inc = 13
+     assert_equal("yada yada yada", t.text)
+     assert_equal(11, t.start)
+     assert_equal(12, t.end)
+     assert_equal(13, t.pos_inc)
+
+     t = Token.new("text", 1, 2)
+     assert_equal(1, t.pos_inc)
+   end
+ end
+
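The TokenTest above exercises the whole Token API: the token text, its start and end byte offsets into the input, and a position increment that defaults to 1 when omitted. A minimal sketch of the same API outside the test harness, assuming only that the gem loads as ferret:

    require 'rubygems'
    require 'ferret'
    include Ferret::Analysis

    # A token carries its text, its start/end byte offsets into the
    # original input, and a position increment: how many positions it
    # advanced past the previous token (1 unless set otherwise).
    t = Token.new("text", 0, 4)
    t.pos_inc        # => 1
    t.pos_inc = 2    # e.g. a stop word was removed just before it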
+ class AsciiLetterTokenizerTest < Test::Unit::TestCase
+   include Ferret::Analysis
+
+   def test_letter_tokenizer()
+     input = 'DBalmain@gmail.com is My e-mail 523@#$ ADDRESS. 23#!$'
+     t = AsciiLetterTokenizer.new(input)
+     assert_equal(Token.new("DBalmain", 0, 8), t.next())
+     assert_equal(Token.new("gmail", 9, 14), t.next())
+     assert_equal(Token.new("com", 15, 18), t.next())
+     assert_equal(Token.new("is", 19, 21), t.next())
+     assert_equal(Token.new("My", 22, 24), t.next())
+     assert_equal(Token.new("e", 25, 26), t.next())
+     assert_equal(Token.new("mail", 27, 31), t.next())
+     assert_equal(Token.new("ADDRESS", 39, 46), t.next())
+     assert(! t.next())
+     t.text = "one_two three"
+     assert_equal(Token.new("one", 0, 3), t.next())
+     assert_equal(Token.new("two", 4, 7), t.next())
+     assert_equal(Token.new("three", 8, 13), t.next())
+     assert(! t.next())
+     t = AsciiLowerCaseFilter.new(AsciiLetterTokenizer.new(input))
+     assert_equal(Token.new("dbalmain", 0, 8), t.next())
+     assert_equal(Token.new("gmail", 9, 14), t.next())
+     assert_equal(Token.new("com", 15, 18), t.next())
+     assert_equal(Token.new("is", 19, 21), t.next())
+     assert_equal(Token.new("my", 22, 24), t.next())
+     assert_equal(Token.new("e", 25, 26), t.next())
+     assert_equal(Token.new("mail", 27, 31), t.next())
+     assert_equal(Token.new("address", 39, 46), t.next())
+     assert(! t.next())
+   end
+ end
+
+ class LetterTokenizerTest < Test::Unit::TestCase
+   include Ferret::Analysis
+
+   def test_letter_tokenizer()
+     input = 'DBalmän@gmail.com is My e-mail 52   #$ address. 23#!$ ÁÄGÇ®ÊË̯ÚØÃ¬ÖÎÍ'
+     t = LetterTokenizer.new(input)
+     assert_equal(Token.new('DBalmän', 0, 8), t.next)
+     assert_equal(Token.new('gmail', 9, 14), t.next)
+     assert_equal(Token.new('com', 15, 18), t.next)
+     assert_equal(Token.new('is', 19, 21), t.next)
+     assert_equal(Token.new('My', 22, 24), t.next)
+     assert_equal(Token.new('e', 25, 26), t.next)
+     assert_equal(Token.new('mail', 27, 31), t.next)
+     assert_equal(Token.new('address', 40, 47), t.next)
+     assert_equal(Token.new('ÁÄGÇ', 55, 62), t.next)
+     assert_equal(Token.new('ÊËÌ', 64, 70), t.next)
+     assert_equal(Token.new('ÚØÃ', 72, 78), t.next)
+     assert_equal(Token.new('ÖÎÍ', 80, 86), t.next)
+     assert(! t.next())
+     t.text = "one_two three"
+     assert_equal(Token.new("one", 0, 3), t.next())
+     assert_equal(Token.new("two", 4, 7), t.next())
+     assert_equal(Token.new("three", 8, 13), t.next())
+     assert(! t.next())
+     t = LowerCaseFilter.new(LetterTokenizer.new(input))
+     assert_equal(Token.new('dbalmän', 0, 8), t.next)
+     assert_equal(Token.new('gmail', 9, 14), t.next)
+     assert_equal(Token.new('com', 15, 18), t.next)
+     assert_equal(Token.new('is', 19, 21), t.next)
+     assert_equal(Token.new('my', 22, 24), t.next)
+     assert_equal(Token.new('e', 25, 26), t.next)
+     assert_equal(Token.new('mail', 27, 31), t.next)
+     assert_equal(Token.new('address', 40, 47), t.next)
+     assert_equal(Token.new('áägç', 55, 62), t.next)
+     assert_equal(Token.new('êëì', 64, 70), t.next)
+     assert_equal(Token.new('úøã', 72, 78), t.next)
+     assert_equal(Token.new('öîí', 80, 86), t.next)
+     assert(! t.next())
+     t = LetterTokenizer.new(input, true)
+     assert_equal(Token.new('dbalmän', 0, 8), t.next)
+     assert_equal(Token.new('gmail', 9, 14), t.next)
+     assert_equal(Token.new('com', 15, 18), t.next)
+     assert_equal(Token.new('is', 19, 21), t.next)
+     assert_equal(Token.new('my', 22, 24), t.next)
+     assert_equal(Token.new('e', 25, 26), t.next)
+     assert_equal(Token.new('mail', 27, 31), t.next)
+     assert_equal(Token.new('address', 40, 47), t.next)
+     assert_equal(Token.new('áägç', 55, 62), t.next)
+     assert_equal(Token.new('êëì', 64, 70), t.next)
+     assert_equal(Token.new('úøã', 72, 78), t.next)
+     assert_equal(Token.new('öîí', 80, 86), t.next)
+     assert(! t.next())
+   end
+ end if (/utf-8/i =~ Ferret.locale)
+
+ class AsciiWhiteSpaceTokenizerTest < Test::Unit::TestCase
+   include Ferret::Analysis
+
+   def test_whitespace_tokenizer()
+     input = 'DBalmain@gmail.com is My e-mail 52   #$ ADDRESS. 23#!$'
+     t = AsciiWhiteSpaceTokenizer.new(input)
+     assert_equal(Token.new('DBalmain@gmail.com', 0, 18), t.next)
+     assert_equal(Token.new('is', 19, 21), t.next)
+     assert_equal(Token.new('My', 22, 24), t.next)
+     assert_equal(Token.new('e-mail', 25, 31), t.next)
+     assert_equal(Token.new('52', 32, 34), t.next)
+     assert_equal(Token.new('#$', 37, 39), t.next)
+     assert_equal(Token.new('ADDRESS.', 40, 48), t.next)
+     assert_equal(Token.new('23#!$', 49, 54), t.next)
+     assert(! t.next())
+     t.text = "one_two three"
+     assert_equal(Token.new("one_two", 0, 7), t.next())
+     assert_equal(Token.new("three", 8, 13), t.next())
+     assert(! t.next())
+     t = AsciiLowerCaseFilter.new(AsciiWhiteSpaceTokenizer.new(input))
+     assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
+     assert_equal(Token.new('is', 19, 21), t.next)
+     assert_equal(Token.new('my', 22, 24), t.next)
+     assert_equal(Token.new('e-mail', 25, 31), t.next)
+     assert_equal(Token.new('52', 32, 34), t.next)
+     assert_equal(Token.new('#$', 37, 39), t.next)
+     assert_equal(Token.new('address.', 40, 48), t.next)
+     assert_equal(Token.new('23#!$', 49, 54), t.next)
+     assert(! t.next())
+   end
+ end
+
+ class WhiteSpaceTokenizerTest < Test::Unit::TestCase
+   include Ferret::Analysis
+
+   def test_whitespace_tokenizer()
+     input = 'DBalmän@gmail.com is My e-mail 52   #$ address. 23#!$ ÁÄGÇ®ÊË̯ÚØÃ¬ÖÎÍ'
+     t = WhiteSpaceTokenizer.new(input)
+     assert_equal(Token.new('DBalmän@gmail.com', 0, 18), t.next)
+     assert_equal(Token.new('is', 19, 21), t.next)
+     assert_equal(Token.new('My', 22, 24), t.next)
+     assert_equal(Token.new('e-mail', 25, 31), t.next)
+     assert_equal(Token.new('52', 32, 34), t.next)
+     assert_equal(Token.new('#$', 37, 39), t.next)
+     assert_equal(Token.new('address.', 40, 48), t.next)
+     assert_equal(Token.new('23#!$', 49, 54), t.next)
+     assert_equal(Token.new('ÁÄGÇ®ÊË̯ÚØÃ¬ÖÎÍ', 55, 86), t.next)
+     assert(! t.next())
+     t.text = "one_two three"
+     assert_equal(Token.new("one_two", 0, 7), t.next())
+     assert_equal(Token.new("three", 8, 13), t.next())
+     assert(! t.next())
+     t = LowerCaseFilter.new(WhiteSpaceTokenizer.new(input))
+     assert_equal(Token.new('dbalmän@gmail.com', 0, 18), t.next)
+     assert_equal(Token.new('is', 19, 21), t.next)
+     assert_equal(Token.new('my', 22, 24), t.next)
+     assert_equal(Token.new('e-mail', 25, 31), t.next)
+     assert_equal(Token.new('52', 32, 34), t.next)
+     assert_equal(Token.new('#$', 37, 39), t.next)
+     assert_equal(Token.new('address.', 40, 48), t.next)
+     assert_equal(Token.new('23#!$', 49, 54), t.next)
+     assert_equal(Token.new('áägç®êëì¯úøã¬öîí', 55, 86), t.next)
+     assert(! t.next())
+     t = WhiteSpaceTokenizer.new(input, true)
+     assert_equal(Token.new('dbalmän@gmail.com', 0, 18), t.next)
+     assert_equal(Token.new('is', 19, 21), t.next)
+     assert_equal(Token.new('my', 22, 24), t.next)
+     assert_equal(Token.new('e-mail', 25, 31), t.next)
+     assert_equal(Token.new('52', 32, 34), t.next)
+     assert_equal(Token.new('#$', 37, 39), t.next)
+     assert_equal(Token.new('address.', 40, 48), t.next)
+     assert_equal(Token.new('23#!$', 49, 54), t.next)
+     assert_equal(Token.new('áägç®êëì¯úøã¬öîí', 55, 86), t.next)
+     assert(! t.next())
+   end
+ end if (/utf-8/i =~ Ferret.locale)
+
+ class AsciiStandardTokenizerTest < Test::Unit::TestCase
+   include Ferret::Analysis
+
+   def test_standard_tokenizer()
+     input = 'DBalmain@gmail.com is My e-mail 52   #$ Address. 23#!$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234'
+     t = AsciiStandardTokenizer.new(input)
+     assert_equal(Token.new('DBalmain@gmail.com', 0, 18), t.next)
+     assert_equal(Token.new('is', 19, 21), t.next)
+     assert_equal(Token.new('My', 22, 24), t.next)
+     assert_equal(Token.new('e-mail', 25, 31), t.next)
+     assert_equal(Token.new('52', 32, 34), t.next)
+     assert_equal(Token.new('Address', 40, 47), t.next)
+     assert_equal(Token.new('23', 49, 51), t.next)
+     assert_equal(Token.new('www.google.com/results', 55, 85), t.next)
+     assert_equal(Token.new('TNT', 86, 91), t.next)
+     assert_equal(Token.new('123-1235-ASD-1234', 93, 110), t.next)
+     assert(! t.next())
+     t.text = "one_two three"
+     assert_equal(Token.new("one_two", 0, 7), t.next())
+     assert_equal(Token.new("three", 8, 13), t.next())
+     assert(! t.next())
+     t = AsciiLowerCaseFilter.new(AsciiStandardTokenizer.new(input))
+     assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
+     assert_equal(Token.new('is', 19, 21), t.next)
+     assert_equal(Token.new('my', 22, 24), t.next)
+     assert_equal(Token.new('e-mail', 25, 31), t.next)
+     assert_equal(Token.new('52', 32, 34), t.next)
+     assert_equal(Token.new('address', 40, 47), t.next)
+     assert_equal(Token.new('23', 49, 51), t.next)
+     assert_equal(Token.new('www.google.com/results', 55, 85), t.next)
+     assert_equal(Token.new('tnt', 86, 91), t.next)
+     assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
+     assert(! t.next())
+   end
+ end
+
+ class StandardTokenizerTest < Test::Unit::TestCase
+   include Ferret::Analysis
+
+   def test_standard_tokenizer()
+     input = 'DBalmán@gmail.com is My e-mail 52   #$ Address. 23#!$ http://www.google.com/res_345/ T.N.T. 123-1235-ASD-1234 23#!$ ÁÄGÇ®ÊË̯ÚØÃ¬ÖÎÍ'
+     t = StandardTokenizer.new(input)
+     assert_equal(Token.new('DBalmán@gmail.com', 0, 18), t.next)
+     assert_equal(Token.new('is', 19, 21), t.next)
+     assert_equal(Token.new('My', 22, 24), t.next)
+     assert_equal(Token.new('e-mail', 25, 31), t.next)
+     assert_equal(Token.new('52', 32, 34), t.next)
+     assert_equal(Token.new('Address', 40, 47), t.next)
+     assert_equal(Token.new('23', 49, 51), t.next)
+     assert_equal(Token.new('www.google.com/res_345', 55, 85), t.next)
+     assert_equal(Token.new('TNT', 86, 91), t.next)
+     assert_equal(Token.new('123-1235-ASD-1234', 93, 110), t.next)
+     assert_equal(Token.new('23', 111, 113), t.next)
+     assert_equal(Token.new('ÁÄGÇ', 117, 124), t.next)
+     assert_equal(Token.new('ÊËÌ', 126, 132), t.next)
+     assert_equal(Token.new('ÚØÃ', 134, 140), t.next)
+     assert_equal(Token.new('ÖÎÍ', 142, 148), t.next)
+     assert(! t.next())
+     t.text = "one_two three"
+     assert_equal(Token.new("one_two", 0, 7), t.next())
+     assert_equal(Token.new("three", 8, 13), t.next())
+     assert(! t.next())
+     t = LowerCaseFilter.new(StandardTokenizer.new(input))
+     assert_equal(Token.new('dbalmán@gmail.com', 0, 18), t.next)
+     assert_equal(Token.new('is', 19, 21), t.next)
+     assert_equal(Token.new('my', 22, 24), t.next)
+     assert_equal(Token.new('e-mail', 25, 31), t.next)
+     assert_equal(Token.new('52', 32, 34), t.next)
+     assert_equal(Token.new('address', 40, 47), t.next)
+     assert_equal(Token.new('23', 49, 51), t.next)
+     assert_equal(Token.new('www.google.com/res_345', 55, 85), t.next)
+     assert_equal(Token.new('tnt', 86, 91), t.next)
+     assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
+     assert_equal(Token.new('23', 111, 113), t.next)
+     assert_equal(Token.new('áägç', 117, 124), t.next)
+     assert_equal(Token.new('êëì', 126, 132), t.next)
+     assert_equal(Token.new('úøã', 134, 140), t.next)
+     assert_equal(Token.new('öîí', 142, 148), t.next)
+     input = "e-mail 123-1235-asd-1234 http://www.davebalmain.com/trac-site/"
+     t = HyphenFilter.new(StandardTokenizer.new(input))
+     assert_equal(Token.new('email', 0, 6), t.next)
+     assert_equal(Token.new('e', 0, 1, 0), t.next)
+     assert_equal(Token.new('mail', 2, 6, 1), t.next)
+     assert_equal(Token.new('123-1235-asd-1234', 7, 24), t.next)
+     assert_equal(Token.new('www.davebalmain.com/trac-site', 25, 61), t.next)
+     assert(! t.next())
+   end
+ end if (/utf-8/i =~ Ferret.locale)
+
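The HyphenFilter assertions that close StandardTokenizerTest pin down its contract: for a hyphenated word it emits the joined form first, then each part, with the first part at position increment 0 so that it occupies the same position as the joined form. A cut-down sketch of that behaviour, under the same UTF-8 locale assumption as the test above:

    require 'rubygems'
    require 'ferret'
    include Ferret::Analysis

    t = HyphenFilter.new(StandardTokenizer.new("e-mail"))
    t.next  # => "email", offsets 0..6
    t.next  # => "e", pos_inc 0: stacked on the same position as "email"
    t.next  # => "mail", pos_inc 1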
+ class RegExpTokenizerTest < Test::Unit::TestCase
+   include Ferret::Analysis
+
+   ALPHA = /[[:alpha:]_-]+/
+   APOSTROPHE = /#{ALPHA}('#{ALPHA})+/
+   ACRONYM = /#{ALPHA}\.(#{ALPHA}\.)+/
+   ACRONYM_WORD = /^#{ACRONYM}$/
+   APOSTROPHE_WORD = /^#{APOSTROPHE}$/
+
+   def test_reg_exp_tokenizer()
+     input = 'DBalmain@gmail.com is My e-mail 52   #$ Address. 23#!$ http://www.google.com/RESULT_3.html T.N.T. 123-1235-ASD-1234 23 Rob\'s'
+     t = RegExpTokenizer.new(input)
+     assert_equal(Token.new('DBalmain@gmail.com', 0, 18), t.next)
+     assert_equal(Token.new('is', 19, 21), t.next)
+     assert_equal(Token.new('My', 22, 24), t.next)
+     assert_equal(Token.new('e-mail', 25, 31), t.next)
+     assert_equal(Token.new('52', 32, 34), t.next)
+     assert_equal(Token.new('Address', 40, 47), t.next)
+     assert_equal(Token.new('23', 49, 51), t.next)
+     assert_equal(Token.new('http://www.google.com/RESULT_3.html', 55, 90), t.next)
+     assert_equal(Token.new('T.N.T.', 91, 97), t.next)
+     assert_equal(Token.new('123-1235-ASD-1234', 98, 115), t.next)
+     assert_equal(Token.new('23', 116, 118), t.next)
+     assert_equal(Token.new('Rob\'s', 119, 124), t.next)
+     assert(! t.next())
+     t.text = "one_two three"
+     assert_equal(Token.new("one_two", 0, 7), t.next())
+     assert_equal(Token.new("three", 8, 13), t.next())
+     assert(! t.next())
+     t = LowerCaseFilter.new(RegExpTokenizer.new(input))
+     t2 = LowerCaseFilter.new(RegExpTokenizer.new(input, /\w{2,}/))
+     assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
+     assert_equal(Token.new('is', 19, 21), t.next)
+     assert_equal(Token.new('my', 22, 24), t.next)
+     assert_equal(Token.new('e-mail', 25, 31), t.next)
+     assert_equal(Token.new('52', 32, 34), t.next)
+     assert_equal(Token.new('address', 40, 47), t.next)
+     assert_equal(Token.new('23', 49, 51), t.next)
+     assert_equal(Token.new('http://www.google.com/result_3.html', 55, 90), t.next)
+     assert_equal(Token.new('t.n.t.', 91, 97), t.next)
+     assert_equal(Token.new('123-1235-asd-1234', 98, 115), t.next)
+     assert_equal(Token.new('23', 116, 118), t.next)
+     assert_equal(Token.new('rob\'s', 119, 124), t.next)
+     assert(! t.next())
+     assert_equal(Token.new('dbalmain', 0, 8), t2.next)
+     assert_equal(Token.new('gmail', 9, 14), t2.next)
+     assert_equal(Token.new('com', 15, 18), t2.next)
+     assert_equal(Token.new('is', 19, 21), t2.next)
+     assert_equal(Token.new('my', 22, 24), t2.next)
+     assert_equal(Token.new('mail', 27, 31), t2.next)
+     assert_equal(Token.new('52', 32, 34), t2.next)
+     assert_equal(Token.new('address', 40, 47), t2.next)
+     assert_equal(Token.new('23', 49, 51), t2.next)
+     assert_equal(Token.new('http', 55, 59), t2.next)
+     assert_equal(Token.new('www', 62, 65), t2.next)
+     assert_equal(Token.new('google', 66, 72), t2.next)
+     assert_equal(Token.new('com', 73, 76), t2.next)
+     assert_equal(Token.new('result_3', 77, 85), t2.next)
+     assert_equal(Token.new('html', 86, 90), t2.next)
+     assert_equal(Token.new('123', 98, 101), t2.next)
+     assert_equal(Token.new('1235', 102, 106), t2.next)
+     assert_equal(Token.new('asd', 107, 110), t2.next)
+     assert_equal(Token.new('1234', 111, 115), t2.next)
+     assert_equal(Token.new('23', 116, 118), t2.next)
+     assert_equal(Token.new('rob', 119, 122), t2.next)
+     assert(! t2.next())
+     t = RegExpTokenizer.new(input) do |str|
+       if str =~ ACRONYM_WORD
+         str.gsub!(/\./, '')
+       elsif str =~ APOSTROPHE_WORD
+         str.gsub!(/'[sS]$/, '')
+       end
+       str
+     end
+     t = LowerCaseFilter.new(t)
+     assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
+     assert_equal(Token.new('is', 19, 21), t.next)
+     assert_equal(Token.new('my', 22, 24), t.next)
+     assert_equal(Token.new('e-mail', 25, 31), t.next)
+     assert_equal(Token.new('52', 32, 34), t.next)
+     assert_equal(Token.new('address', 40, 47), t.next)
+     assert_equal(Token.new('23', 49, 51), t.next)
+     assert_equal(Token.new('http://www.google.com/result_3.html', 55, 90), t.next)
+     assert_equal(Token.new('tnt', 91, 97), t.next)
+     assert_equal(Token.new('123-1235-asd-1234', 98, 115), t.next)
+     assert_equal(Token.new('23', 116, 118), t.next)
+     assert_equal(Token.new('rob', 119, 124), t.next)
+     assert(! t.next())
+   end
+ end
+
+ class MappingFilterTest < Test::Unit::TestCase
+   include Ferret::Analysis
+
+   def test_mapping_filter()
+     mapping = {
+       ['à','á','â','ã','ä','å','ā','ă'] => 'a',
+       'æ' => 'ae',
+       ['ď','đ'] => 'd',
+       ['ç','ć','č','ĉ','ċ'] => 'c',
+       ['è','é','ê','ë','ē','ę','ě','ĕ','ė',] => 'e',
+       ['ƒ'] => 'f',
+       ['ĝ','ğ','ġ','ģ'] => 'g',
+       ['ĥ','ħ'] => 'h',
+       ['ì','ì','í','î','ï','ī','ĩ','ĭ'] => 'i',
+       ['į','ı','ij','ĵ'] => 'j',
+       ['ķ','ĸ'] => 'k',
+       ['ł','ľ','ĺ','ļ','ŀ'] => 'l',
+       ['ñ','ń','ň','ņ','ʼn','ŋ'] => 'n',
+       ['ò','ó','ô','õ','ö','ø','ō','ő','ŏ','ŏ'] => 'o',
+       'œ' => 'oek',
+       'ą' => 'q',
+       ['ŕ','ř','ŗ'] => 'r',
+       ['ś','š','ş','ŝ','ș'] => 's',
+       ['ť','ţ','ŧ','ț'] => 't',
+       ['ù','ú','û','ü','ū','ů','ű','ŭ','ũ','ų'] => 'u',
+       'ŵ' => 'w',
+       ['ý','ÿ','ŷ'] => 'y',
+       ['ž','ż','ź'] => 'z'
+     }
+     input = <<END
+ aàáâãäåāăb cæd eďđf gçćčĉċh ièéêëēęěĕėj kƒl mĝğġģn oĥħp qììíîïīĩĭr sįıijĵt uķĸv
+ włľĺļŀx yñńňņʼnŋz aòóôõöøōőŏŏb cœd eąf gŕřŗh iśšşŝșj kťţŧțl mùúûüūůűŭũųn oŵp
+ qýÿŷr sžżźt
+ END
+     t = MappingFilter.new(LetterTokenizer.new(input), mapping)
+     assert_equal(Token.new('aaaaaaaaab', 0, 18), t.next)
+     assert_equal(Token.new('caed', 19, 23), t.next)
+     assert_equal(Token.new('eddf', 24, 30), t.next)
+     assert_equal(Token.new('gccccch', 31, 43), t.next)
+     assert_equal(Token.new('ieeeeeeeeej', 44, 64), t.next)
+     assert_equal(Token.new('kfl', 65, 69), t.next)
+     assert_equal(Token.new('mggggn', 70, 80), t.next)
+     assert_equal(Token.new('ohhp', 81, 87), t.next)
+     assert_equal(Token.new('qiiiiiiiir', 88, 106), t.next)
+     assert_equal(Token.new('sjjjjt', 107, 117), t.next)
+     assert_equal(Token.new('ukkv', 118, 124), t.next)
+     assert_equal(Token.new('wlllllx', 125, 137), t.next)
+     assert_equal(Token.new('ynnnnnnz', 138, 152), t.next)
+     assert_equal(Token.new('aoooooooooob', 153, 175), t.next)
+     assert_equal(Token.new('coekd', 176, 180), t.next)
+     assert_equal(Token.new('eqf', 181, 185), t.next)
+     assert_equal(Token.new('grrrh', 186, 194), t.next)
+     assert_equal(Token.new('isssssj', 195, 207), t.next)
+     assert_equal(Token.new('kttttl', 208, 218), t.next)
+     assert_equal(Token.new('muuuuuuuuuun', 219, 241), t.next)
+     assert_equal(Token.new('owp', 242, 246), t.next)
+     assert_equal(Token.new('qyyyr', 247, 255), t.next)
+     assert_equal(Token.new('szzzt', 256, 264), t.next)
+     assert(! t.next())
+   end
+ end if (/utf-8/i =~ Ferret.locale)
+
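As the test above shows, MappingFilter rewrites token text through a substitution table whose keys are single characters (or arrays of them) and whose values are replacement strings; the offsets still point at the original, unfolded input, which is what makes it suitable for accent folding before indexing. A smaller sketch with a hypothetical two-entry mapping, again assuming a UTF-8 locale:

    require 'rubygems'
    require 'ferret'
    include Ferret::Analysis

    # Fold a couple of accented characters; keys may be one character
    # or an array of characters, values are replacement strings.
    mapping = { ['à', 'á', 'â'] => 'a', 'æ' => 'ae' }
    t = MappingFilter.new(LetterTokenizer.new("ràpid æther"), mapping)
    t.next.text  # => "rapid"
    t.next.text  # => "aether"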
+ class StopFilterTest < Test::Unit::TestCase
+   include Ferret::Analysis
+
+   def test_stop_filter()
+     words = ["one", "four", "five", "seven"]
+     input = "one, two, three, four, five, six, seven, eight, nine, ten."
+     t = StopFilter.new(AsciiLetterTokenizer.new(input), words)
+     assert_equal(Token.new('two', 5, 8, 2), t.next)
+     assert_equal(Token.new('three', 10, 15, 1), t.next)
+     assert_equal(Token.new('six', 29, 32, 3), t.next)
+     assert_equal(Token.new('eight', 41, 46, 2), t.next)
+     assert_equal(Token.new('nine', 48, 52, 1), t.next)
+     assert_equal(Token.new('ten', 54, 57, 1), t.next)
+     assert(! t.next())
+   end
+ end
+
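Note the position increments asserted above: StopFilter does not close the gap a removed word leaves behind, it adds the gap to the next surviving token's pos_inc ('two' carries 2 because 'one' was dropped, 'six' carries 3 because 'four' and 'five' were). Phrase matching depends on this. The shortest possible illustration:

    require 'rubygems'
    require 'ferret'
    include Ferret::Analysis

    t = StopFilter.new(AsciiLetterTokenizer.new("one two"), ["one"])
    tok = t.next
    tok.text     # => "two"
    tok.pos_inc  # => 2; "one" was removed but its position is kept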
+ class StemFilterTest < Test::Unit::TestCase
+   include Ferret::Analysis
+
+   def test_stem_filter()
+     input = "Debate Debates DEBATED DEBating Debater";
+     t = StemFilter.new(AsciiLowerCaseFilter.new(AsciiLetterTokenizer.new(input)),
+                        "english")
+     assert_equal(Token.new("debat", 0, 6), t.next)
+     assert_equal(Token.new("debat", 7, 14), t.next)
+     assert_equal(Token.new("debat", 15, 22), t.next)
+     assert_equal(Token.new("debat", 23, 31), t.next)
+     assert_equal(Token.new("debat", 32, 39), t.next)
+     assert(! t.next())
+     t = StemFilter.new(AsciiLetterTokenizer.new(input), :english)
+     assert_equal(Token.new("Debat", 0, 6), t.next)
+     assert_equal(Token.new("Debat", 7, 14), t.next)
+     assert_equal(Token.new("DEBATED", 15, 22), t.next)
+     assert_equal(Token.new("DEBate", 23, 31), t.next)
+     assert_equal(Token.new("Debat", 32, 39), t.next)
+
+     if Ferret.locale and Ferret.locale.downcase.index("utf")
+       input = "Dêbate dêbates DÊBATED DÊBATing dêbater";
+       t = StemFilter.new(LowerCaseFilter.new(LetterTokenizer.new(input)), :english)
+       assert_equal(Token.new("dêbate", 0, 7), t.next)
+       assert_equal(Token.new("dêbate", 8, 16), t.next)
+       assert_equal(Token.new("dêbate", 17, 25), t.next)
+       assert_equal(Token.new("dêbate", 26, 35), t.next)
+       assert_equal(Token.new("dêbater", 36, 44), t.next)
+       t = StemFilter.new(LetterTokenizer.new(input), :english)
+       assert_equal(Token.new("Dêbate", 0, 7), t.next)
+       assert_equal(Token.new("dêbate", 8, 16), t.next)
+       assert_equal(Token.new("DÊBATED", 17, 25), t.next)
+       assert_equal(Token.new("DÊBATing", 26, 35), t.next)
+       assert_equal(Token.new("dêbater", 36, 44), t.next)
+       assert(! t.next())
+     end
+
+     tz = AsciiLetterTokenizer.new(input)
+     assert_not_nil(StemFilter.new(tz, 'HunGarIaN', 'Utf-8'))
+     assert_not_nil(StemFilter.new(tz, 'romanIAN', 'iso-8859-2'))
+     assert_raises(ArgumentError) { StemFilter.new(tz, 'Jibberish', 'UTF-8') }
+   end
+ end
+
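Chains like the ones tested here (tokenizer, case filter, stemmer) are what a Ferret analyzer hands back from token_stream, the hook the index calls for each field value at indexing time. A sketch of wrapping the chain from the first StemFilter assertion into an analyzer; the class name is an arbitrary choice for illustration, and the subclassing pattern is the one Ferret's documentation describes rather than anything taken from this diff:

    require 'rubygems'
    require 'ferret'

    class StemmingAnalyzer < Ferret::Analysis::Analyzer
      include Ferret::Analysis

      # Ferret calls token_stream(field, text) once per field value;
      # returning a filter chain over a tokenizer is the whole contract.
      def token_stream(field, text)
        StemFilter.new(AsciiLowerCaseFilter.new(
                         AsciiLetterTokenizer.new(text)), "english")
      end
    end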
+ require 'strscan'
+ module Ferret::Analysis
+
+   class MyRegExpTokenizer < TokenStream
+
+     def initialize(input)
+       @ss = StringScanner.new(input)
+     end
+
+     # Returns the next token in the stream, or nil at the end of the stream.
+     def next()
+       if @ss.scan_until(token_re)
+         term = @ss.matched
+         term_end = @ss.pos
+         term_start = term_end - term.size
+       else
+         return nil
+       end
+
+       return Token.new(normalize(term), term_start, term_end)
+     end
+
+     def text=(text)
+       @ss = StringScanner.new(text)
+     end
+
+     protected
+
+     # returns the regular expression used to find the next token
+     TOKEN_RE = /[[:alpha:]]+/
+     def token_re
+       TOKEN_RE
+     end
+
+     # Called on each token to normalize it before it is added to the
+     # token stream. The default implementation does nothing. Subclasses
+     # may use this to, e.g., lowercase tokens.
+     def normalize(str) return str end
+   end
+
+   class MyReverseTokenFilter < TokenStream
+     def initialize(token_stream)
+       @token_stream = token_stream
+     end
+
+     def text=(text)
+       @token_stream.text = text
+     end
+
+     def next()
+       if token = @token_stream.next
+         token.text = token.text.reverse
+       end
+       token
+     end
+   end
+
+   class MyCSVTokenizer < MyRegExpTokenizer
+     protected
+
+     # returns the regular expression used to find the next token
+     TOKEN_RE = /[^,]+/
+     def token_re
+       TOKEN_RE
+     end
+
+     # Called on each token to normalize it before it is added to the
+     # token stream. This implementation upcases the token text.
+     def normalize(str) return str.upcase end
+   end
+ end
+
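The module above demonstrates the complete custom-stream contract: next returns a Token, or nil at the end of the stream, and text= resets the stream so it can be reused, with filters delegating both to the stream they wrap. A minimal filter in the same mould (UpcaseFilter is hypothetical, named here for illustration):

    require 'rubygems'
    require 'ferret'

    # Mirrors MyReverseTokenFilter above, but upcases the text instead.
    class UpcaseFilter < Ferret::Analysis::TokenStream
      def initialize(token_stream)
        @token_stream = token_stream
      end

      def text=(text)
        @token_stream.text = text
      end

      def next
        if token = @token_stream.next
          token.text = token.text.upcase
        end
        token
      end
    end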
+ class CustomTokenizerTest < Test::Unit::TestCase
+   include Ferret::Analysis
+
+   def test_custom_tokenizer()
+     input = "First Field,2nd Field,  P a d d e d  F i e l d  "
+     t = MyCSVTokenizer.new(input)
+     assert_equal(Token.new("FIRST FIELD", 0, 11), t.next)
+     assert_equal(Token.new("2ND FIELD", 12, 21), t.next)
+     assert_equal(Token.new("  P A D D E D  F I E L D  ", 22, 48), t.next)
+     assert(! t.next())
+     t = AsciiLowerCaseFilter.new(MyCSVTokenizer.new(input))
+     assert_equal(Token.new("first field", 0, 11), t.next)
+     assert_equal(Token.new("2nd field", 12, 21), t.next)
+     assert_equal(Token.new("  p a d d e d  f i e l d  ", 22, 48), t.next)
+     assert(! t.next())
+     t = MyReverseTokenFilter.new(
+           AsciiLowerCaseFilter.new(MyCSVTokenizer.new(input)))
+     assert_equal(Token.new("dleif tsrif", 0, 11), t.next)
+     assert_equal(Token.new("dleif dn2", 12, 21), t.next)
+     assert_equal(Token.new("  d l e i f  d e d d a p  ", 22, 48), t.next)
+     t.text = "one,TWO,three"
+     assert_equal(Token.new("eno", 0, 3), t.next)
+     assert_equal(Token.new("owt", 4, 7), t.next)
+     assert_equal(Token.new("eerht", 8, 13), t.next)
+     t = AsciiLowerCaseFilter.new(
+           MyReverseTokenFilter.new(MyCSVTokenizer.new(input)))
+     assert_equal(Token.new("dleif tsrif", 0, 11), t.next)
+     assert_equal(Token.new("dleif dn2", 12, 21), t.next)
+     assert_equal(Token.new("  d l e i f  d e d d a p  ", 22, 48), t.next)
+     t.text = "one,TWO,three"
+     assert_equal(Token.new("eno", 0, 3), t.next)
+     assert_equal(Token.new("owt", 4, 7), t.next)
+     assert_equal(Token.new("eerht", 8, 13), t.next)
+   end
+ end
+
+ module Ferret::Analysis
+   class TokenFilter < TokenStream
+     protected
+
+     # Construct a token stream filtering the given input.
+     def initialize(input)
+       @input = input
+     end
+   end
+
+   # Capitalizes token text.
+   class CapitalizeFilter < TokenFilter
+     def next()
+       t = @input.next()
+
+       return nil if (t.nil?)
+
+       t.text = t.text.capitalize
+
+       return t
+     end
+   end
+ end
+
627
+ class CustomFilterTest < Test::Unit::TestCase
628
+ include Ferret::Analysis
629
+
630
+ def test_custom_filter()
631
+ input = "This text SHOULD be capitalized ... I hope. :-S"
632
+ t = CapitalizeFilter.new(AsciiLetterTokenizer.new(input))
633
+ assert_equal(Token.new("This", 0, 4), t.next)
634
+ assert_equal(Token.new("Text", 5, 9), t.next)
635
+ assert_equal(Token.new("Should", 10, 16), t.next)
636
+ assert_equal(Token.new("Be", 17, 19), t.next)
637
+ assert_equal(Token.new("Capitalized", 20, 31), t.next)
638
+ assert_equal(Token.new("I", 36, 37), t.next)
639
+ assert_equal(Token.new("Hope", 38, 42), t.next)
640
+ assert_equal(Token.new("S", 46, 47), t.next)
641
+ assert(! t.next())
642
+ t = StemFilter.new(CapitalizeFilter.new(AsciiLetterTokenizer.new(input)))
643
+ assert_equal(Token.new("This", 0, 4), t.next)
644
+ assert_equal(Token.new("Text", 5, 9), t.next)
645
+ assert_equal(Token.new("Should", 10, 16), t.next)
646
+ assert_equal(Token.new("Be", 17, 19), t.next)
647
+ assert_equal(Token.new("Capit", 20, 31), t.next)
648
+ assert_equal(Token.new("I", 36, 37), t.next)
649
+ assert_equal(Token.new("Hope", 38, 42), t.next)
650
+ assert_equal(Token.new("S", 46, 47), t.next)
651
+ assert(! t.next())
652
+ end
653
+ end