sdsykes-ferret 0.11.6.19

Files changed (195)
  1. data/CHANGELOG +24 -0
  2. data/MIT-LICENSE +20 -0
  3. data/README +102 -0
  4. data/Rakefile +338 -0
  5. data/TODO +17 -0
  6. data/TUTORIAL +231 -0
  7. data/bin/ferret-browser +79 -0
  8. data/ext/analysis.c +1555 -0
  9. data/ext/analysis.h +219 -0
  10. data/ext/api.c +69 -0
  11. data/ext/api.h +27 -0
  12. data/ext/array.c +123 -0
  13. data/ext/array.h +53 -0
  14. data/ext/bitvector.c +540 -0
  15. data/ext/bitvector.h +272 -0
  16. data/ext/compound_io.c +383 -0
  17. data/ext/config.h +42 -0
  18. data/ext/document.c +156 -0
  19. data/ext/document.h +53 -0
  20. data/ext/except.c +120 -0
  21. data/ext/except.h +168 -0
  22. data/ext/extconf.rb +14 -0
  23. data/ext/ferret.c +402 -0
  24. data/ext/ferret.h +91 -0
  25. data/ext/filter.c +156 -0
  26. data/ext/fs_store.c +483 -0
  27. data/ext/global.c +418 -0
  28. data/ext/global.h +117 -0
  29. data/ext/hash.c +567 -0
  30. data/ext/hash.h +473 -0
  31. data/ext/hashset.c +170 -0
  32. data/ext/hashset.h +187 -0
  33. data/ext/header.h +58 -0
  34. data/ext/helper.c +62 -0
  35. data/ext/helper.h +13 -0
  36. data/ext/inc/lang.h +48 -0
  37. data/ext/inc/threading.h +31 -0
  38. data/ext/index.c +6425 -0
  39. data/ext/index.h +961 -0
  40. data/ext/lang.h +66 -0
  41. data/ext/libstemmer.c +92 -0
  42. data/ext/libstemmer.h +79 -0
  43. data/ext/mempool.c +87 -0
  44. data/ext/mempool.h +35 -0
  45. data/ext/modules.h +162 -0
  46. data/ext/multimapper.c +310 -0
  47. data/ext/multimapper.h +51 -0
  48. data/ext/posh.c +1006 -0
  49. data/ext/posh.h +1007 -0
  50. data/ext/priorityqueue.c +151 -0
  51. data/ext/priorityqueue.h +143 -0
  52. data/ext/q_boolean.c +1608 -0
  53. data/ext/q_const_score.c +161 -0
  54. data/ext/q_filtered_query.c +209 -0
  55. data/ext/q_fuzzy.c +268 -0
  56. data/ext/q_match_all.c +148 -0
  57. data/ext/q_multi_term.c +677 -0
  58. data/ext/q_parser.c +2825 -0
  59. data/ext/q_phrase.c +1126 -0
  60. data/ext/q_prefix.c +100 -0
  61. data/ext/q_range.c +350 -0
  62. data/ext/q_span.c +2402 -0
  63. data/ext/q_term.c +337 -0
  64. data/ext/q_wildcard.c +171 -0
  65. data/ext/r_analysis.c +2575 -0
  66. data/ext/r_index.c +3472 -0
  67. data/ext/r_qparser.c +585 -0
  68. data/ext/r_search.c +4105 -0
  69. data/ext/r_store.c +513 -0
  70. data/ext/r_utils.c +963 -0
  71. data/ext/ram_store.c +471 -0
  72. data/ext/search.c +1741 -0
  73. data/ext/search.h +885 -0
  74. data/ext/similarity.c +150 -0
  75. data/ext/similarity.h +82 -0
  76. data/ext/sort.c +983 -0
  77. data/ext/stem_ISO_8859_1_danish.c +338 -0
  78. data/ext/stem_ISO_8859_1_danish.h +16 -0
  79. data/ext/stem_ISO_8859_1_dutch.c +635 -0
  80. data/ext/stem_ISO_8859_1_dutch.h +16 -0
  81. data/ext/stem_ISO_8859_1_english.c +1156 -0
  82. data/ext/stem_ISO_8859_1_english.h +16 -0
  83. data/ext/stem_ISO_8859_1_finnish.c +792 -0
  84. data/ext/stem_ISO_8859_1_finnish.h +16 -0
  85. data/ext/stem_ISO_8859_1_french.c +1276 -0
  86. data/ext/stem_ISO_8859_1_french.h +16 -0
  87. data/ext/stem_ISO_8859_1_german.c +512 -0
  88. data/ext/stem_ISO_8859_1_german.h +16 -0
  89. data/ext/stem_ISO_8859_1_italian.c +1091 -0
  90. data/ext/stem_ISO_8859_1_italian.h +16 -0
  91. data/ext/stem_ISO_8859_1_norwegian.c +296 -0
  92. data/ext/stem_ISO_8859_1_norwegian.h +16 -0
  93. data/ext/stem_ISO_8859_1_porter.c +776 -0
  94. data/ext/stem_ISO_8859_1_porter.h +16 -0
  95. data/ext/stem_ISO_8859_1_portuguese.c +1035 -0
  96. data/ext/stem_ISO_8859_1_portuguese.h +16 -0
  97. data/ext/stem_ISO_8859_1_spanish.c +1119 -0
  98. data/ext/stem_ISO_8859_1_spanish.h +16 -0
  99. data/ext/stem_ISO_8859_1_swedish.c +307 -0
  100. data/ext/stem_ISO_8859_1_swedish.h +16 -0
  101. data/ext/stem_KOI8_R_russian.c +701 -0
  102. data/ext/stem_KOI8_R_russian.h +16 -0
  103. data/ext/stem_UTF_8_danish.c +344 -0
  104. data/ext/stem_UTF_8_danish.h +16 -0
  105. data/ext/stem_UTF_8_dutch.c +653 -0
  106. data/ext/stem_UTF_8_dutch.h +16 -0
  107. data/ext/stem_UTF_8_english.c +1176 -0
  108. data/ext/stem_UTF_8_english.h +16 -0
  109. data/ext/stem_UTF_8_finnish.c +808 -0
  110. data/ext/stem_UTF_8_finnish.h +16 -0
  111. data/ext/stem_UTF_8_french.c +1296 -0
  112. data/ext/stem_UTF_8_french.h +16 -0
  113. data/ext/stem_UTF_8_german.c +526 -0
  114. data/ext/stem_UTF_8_german.h +16 -0
  115. data/ext/stem_UTF_8_italian.c +1113 -0
  116. data/ext/stem_UTF_8_italian.h +16 -0
  117. data/ext/stem_UTF_8_norwegian.c +302 -0
  118. data/ext/stem_UTF_8_norwegian.h +16 -0
  119. data/ext/stem_UTF_8_porter.c +794 -0
  120. data/ext/stem_UTF_8_porter.h +16 -0
  121. data/ext/stem_UTF_8_portuguese.c +1055 -0
  122. data/ext/stem_UTF_8_portuguese.h +16 -0
  123. data/ext/stem_UTF_8_russian.c +709 -0
  124. data/ext/stem_UTF_8_russian.h +16 -0
  125. data/ext/stem_UTF_8_spanish.c +1137 -0
  126. data/ext/stem_UTF_8_spanish.h +16 -0
  127. data/ext/stem_UTF_8_swedish.c +313 -0
  128. data/ext/stem_UTF_8_swedish.h +16 -0
  129. data/ext/stopwords.c +401 -0
  130. data/ext/store.c +692 -0
  131. data/ext/store.h +777 -0
  132. data/ext/term_vectors.c +352 -0
  133. data/ext/threading.h +31 -0
  134. data/ext/utilities.c +446 -0
  135. data/ext/win32.h +54 -0
  136. data/lib/ferret.rb +29 -0
  137. data/lib/ferret/browser.rb +246 -0
  138. data/lib/ferret/browser/s/global.js +192 -0
  139. data/lib/ferret/browser/s/style.css +148 -0
  140. data/lib/ferret/browser/views/document/list.rhtml +49 -0
  141. data/lib/ferret/browser/views/document/show.rhtml +27 -0
  142. data/lib/ferret/browser/views/error/index.rhtml +7 -0
  143. data/lib/ferret/browser/views/help/index.rhtml +8 -0
  144. data/lib/ferret/browser/views/home/index.rhtml +29 -0
  145. data/lib/ferret/browser/views/layout.rhtml +22 -0
  146. data/lib/ferret/browser/views/term-vector/index.rhtml +4 -0
  147. data/lib/ferret/browser/views/term/index.rhtml +199 -0
  148. data/lib/ferret/browser/views/term/termdocs.rhtml +1 -0
  149. data/lib/ferret/browser/webrick.rb +14 -0
  150. data/lib/ferret/document.rb +130 -0
  151. data/lib/ferret/field_infos.rb +44 -0
  152. data/lib/ferret/index.rb +786 -0
  153. data/lib/ferret/number_tools.rb +157 -0
  154. data/lib/ferret_version.rb +3 -0
  155. data/setup.rb +1555 -0
  156. data/test/test_all.rb +5 -0
  157. data/test/test_helper.rb +24 -0
  158. data/test/threading/number_to_spoken.rb +132 -0
  159. data/test/threading/thread_safety_index_test.rb +79 -0
  160. data/test/threading/thread_safety_read_write_test.rb +76 -0
  161. data/test/threading/thread_safety_test.rb +133 -0
  162. data/test/unit/analysis/tc_analyzer.rb +548 -0
  163. data/test/unit/analysis/tc_token_stream.rb +646 -0
  164. data/test/unit/index/tc_index.rb +762 -0
  165. data/test/unit/index/tc_index_reader.rb +699 -0
  166. data/test/unit/index/tc_index_writer.rb +437 -0
  167. data/test/unit/index/th_doc.rb +315 -0
  168. data/test/unit/largefile/tc_largefile.rb +46 -0
  169. data/test/unit/query_parser/tc_query_parser.rb +238 -0
  170. data/test/unit/search/tc_filter.rb +135 -0
  171. data/test/unit/search/tc_fuzzy_query.rb +147 -0
  172. data/test/unit/search/tc_index_searcher.rb +61 -0
  173. data/test/unit/search/tc_multi_searcher.rb +128 -0
  174. data/test/unit/search/tc_multiple_search_requests.rb +58 -0
  175. data/test/unit/search/tc_search_and_sort.rb +179 -0
  176. data/test/unit/search/tc_sort.rb +49 -0
  177. data/test/unit/search/tc_sort_field.rb +27 -0
  178. data/test/unit/search/tc_spans.rb +190 -0
  179. data/test/unit/search/tm_searcher.rb +384 -0
  180. data/test/unit/store/tc_fs_store.rb +77 -0
  181. data/test/unit/store/tc_ram_store.rb +35 -0
  182. data/test/unit/store/tm_store.rb +34 -0
  183. data/test/unit/store/tm_store_lock.rb +68 -0
  184. data/test/unit/tc_document.rb +81 -0
  185. data/test/unit/ts_analysis.rb +2 -0
  186. data/test/unit/ts_index.rb +2 -0
  187. data/test/unit/ts_largefile.rb +4 -0
  188. data/test/unit/ts_query_parser.rb +2 -0
  189. data/test/unit/ts_search.rb +2 -0
  190. data/test/unit/ts_store.rb +2 -0
  191. data/test/unit/ts_utils.rb +2 -0
  192. data/test/unit/utils/tc_bit_vector.rb +295 -0
  193. data/test/unit/utils/tc_number_tools.rb +117 -0
  194. data/test/unit/utils/tc_priority_queue.rb +106 -0
  195. metadata +285 -0
data/test/unit/analysis/tc_token_stream.rb
@@ -0,0 +1,646 @@
+ require File.dirname(__FILE__) + "/../../test_helper"
+
+ puts "Loading once"
+ class TokenTest < Test::Unit::TestCase
+   include Ferret::Analysis
+   def test_token
+     t = Token.new("text", 1, 2, 3)
+     assert_equal("text", t.text)
+     assert_equal(1, t.start)
+     assert_equal(2, t.end)
+     assert_equal(3, t.pos_inc)
+     t.text = "yada yada yada"
+     t.start = 11
+     t.end = 12
+     t.pos_inc = 13
+     assert_equal("yada yada yada", t.text)
+     assert_equal(11, t.start)
+     assert_equal(12, t.end)
+     assert_equal(13, t.pos_inc)
+
+     t = Token.new("text", 1, 2)
+     assert_equal(1, t.pos_inc)
+   end
+ end
+
+ class AsciiLetterTokenizerTest < Test::Unit::TestCase
+   include Ferret::Analysis
+
+   def test_letter_tokenizer()
+     input = 'DBalmain@gmail.com is My e-mail 523@#$ ADDRESS. 23#@$'
+     t = AsciiLetterTokenizer.new(input)
+     assert_equal(Token.new("DBalmain", 0, 8), t.next())
+     assert_equal(Token.new("gmail", 9, 14), t.next())
+     assert_equal(Token.new("com", 15, 18), t.next())
+     assert_equal(Token.new("is", 19, 21), t.next())
+     assert_equal(Token.new("My", 22, 24), t.next())
+     assert_equal(Token.new("e", 25, 26), t.next())
+     assert_equal(Token.new("mail", 27, 31), t.next())
+     assert_equal(Token.new("ADDRESS", 39, 46), t.next())
+     assert(! t.next())
+     t.text = "one_two three"
+     assert_equal(Token.new("one", 0, 3), t.next())
+     assert_equal(Token.new("two", 4, 7), t.next())
+     assert_equal(Token.new("three", 8, 13), t.next())
+     assert(! t.next())
+     t = AsciiLowerCaseFilter.new(AsciiLetterTokenizer.new(input))
+     assert_equal(Token.new("dbalmain", 0, 8), t.next())
+     assert_equal(Token.new("gmail", 9, 14), t.next())
+     assert_equal(Token.new("com", 15, 18), t.next())
+     assert_equal(Token.new("is", 19, 21), t.next())
+     assert_equal(Token.new("my", 22, 24), t.next())
+     assert_equal(Token.new("e", 25, 26), t.next())
+     assert_equal(Token.new("mail", 27, 31), t.next())
+     assert_equal(Token.new("address", 39, 46), t.next())
+     assert(! t.next())
+   end
+ end
+
+ class LetterTokenizerTest < Test::Unit::TestCase
+   include Ferret::Analysis
+
+   def test_letter_tokenizer()
+     input = 'DBalmän@gmail.com is My e-mail 52   #$ address. 23#@$ ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ'
+     t = LetterTokenizer.new(input)
+     assert_equal(Token.new('DBalmän', 0, 8), t.next)
+     assert_equal(Token.new('gmail', 9, 14), t.next)
+     assert_equal(Token.new('com', 15, 18), t.next)
+     assert_equal(Token.new('is', 19, 21), t.next)
+     assert_equal(Token.new('My', 22, 24), t.next)
+     assert_equal(Token.new('e', 25, 26), t.next)
+     assert_equal(Token.new('mail', 27, 31), t.next)
+     assert_equal(Token.new('address', 40, 47), t.next)
+     assert_equal(Token.new('ÁÄGÇ', 55, 62), t.next)
+     assert_equal(Token.new('ÊËÌ', 64, 70), t.next)
+     assert_equal(Token.new('ÚØÃ', 72, 78), t.next)
+     assert_equal(Token.new('ÖÎÍ', 80, 86), t.next)
+     assert(! t.next())
+     t.text = "one_two three"
+     assert_equal(Token.new("one", 0, 3), t.next())
+     assert_equal(Token.new("two", 4, 7), t.next())
+     assert_equal(Token.new("three", 8, 13), t.next())
+     assert(! t.next())
+     t = LowerCaseFilter.new(LetterTokenizer.new(input))
+     assert_equal(Token.new('dbalmän', 0, 8), t.next)
+     assert_equal(Token.new('gmail', 9, 14), t.next)
+     assert_equal(Token.new('com', 15, 18), t.next)
+     assert_equal(Token.new('is', 19, 21), t.next)
+     assert_equal(Token.new('my', 22, 24), t.next)
+     assert_equal(Token.new('e', 25, 26), t.next)
+     assert_equal(Token.new('mail', 27, 31), t.next)
+     assert_equal(Token.new('address', 40, 47), t.next)
+     assert_equal(Token.new('áägç', 55, 62), t.next)
+     assert_equal(Token.new('êëì', 64, 70), t.next)
+     assert_equal(Token.new('úøã', 72, 78), t.next)
+     assert_equal(Token.new('öîí', 80, 86), t.next)
+     assert(! t.next())
+     t = LetterTokenizer.new(input, true)
+     assert_equal(Token.new('dbalmän', 0, 8), t.next)
+     assert_equal(Token.new('gmail', 9, 14), t.next)
+     assert_equal(Token.new('com', 15, 18), t.next)
+     assert_equal(Token.new('is', 19, 21), t.next)
+     assert_equal(Token.new('my', 22, 24), t.next)
+     assert_equal(Token.new('e', 25, 26), t.next)
+     assert_equal(Token.new('mail', 27, 31), t.next)
+     assert_equal(Token.new('address', 40, 47), t.next)
+     assert_equal(Token.new('áägç', 55, 62), t.next)
+     assert_equal(Token.new('êëì', 64, 70), t.next)
+     assert_equal(Token.new('úøã', 72, 78), t.next)
+     assert_equal(Token.new('öîí', 80, 86), t.next)
+     assert(! t.next())
+   end
+ end if (/utf-8/i =~ Ferret.locale)
+
+ class AsciiWhiteSpaceTokenizerTest < Test::Unit::TestCase
+   include Ferret::Analysis
+
+   def test_whitespace_tokenizer()
+     input = 'DBalmain@gmail.com is My e-mail 52   #$ ADDRESS. 23#@$'
+     t = AsciiWhiteSpaceTokenizer.new(input)
+     assert_equal(Token.new('DBalmain@gmail.com', 0, 18), t.next)
+     assert_equal(Token.new('is', 19, 21), t.next)
+     assert_equal(Token.new('My', 22, 24), t.next)
+     assert_equal(Token.new('e-mail', 25, 31), t.next)
+     assert_equal(Token.new('52', 32, 34), t.next)
+     assert_equal(Token.new('#$', 37, 39), t.next)
+     assert_equal(Token.new('ADDRESS.', 40, 48), t.next)
+     assert_equal(Token.new('23#@$', 49, 54), t.next)
+     assert(! t.next())
+     t.text = "one_two three"
+     assert_equal(Token.new("one_two", 0, 7), t.next())
+     assert_equal(Token.new("three", 8, 13), t.next())
+     assert(! t.next())
+     t = AsciiLowerCaseFilter.new(AsciiWhiteSpaceTokenizer.new(input))
+     assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
+     assert_equal(Token.new('is', 19, 21), t.next)
+     assert_equal(Token.new('my', 22, 24), t.next)
+     assert_equal(Token.new('e-mail', 25, 31), t.next)
+     assert_equal(Token.new('52', 32, 34), t.next)
+     assert_equal(Token.new('#$', 37, 39), t.next)
+     assert_equal(Token.new('address.', 40, 48), t.next)
+     assert_equal(Token.new('23#@$', 49, 54), t.next)
+     assert(! t.next())
+   end
+ end
+
+ class WhiteSpaceTokenizerTest < Test::Unit::TestCase
+   include Ferret::Analysis
+
+   def test_whitespace_tokenizer()
+     input = 'DBalmän@gmail.com is My e-mail 52   #$ address. 23#@$ ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ'
+     t = WhiteSpaceTokenizer.new(input)
+     assert_equal(Token.new('DBalmän@gmail.com', 0, 18), t.next)
+     assert_equal(Token.new('is', 19, 21), t.next)
+     assert_equal(Token.new('My', 22, 24), t.next)
+     assert_equal(Token.new('e-mail', 25, 31), t.next)
+     assert_equal(Token.new('52', 32, 34), t.next)
+     assert_equal(Token.new('#$', 37, 39), t.next)
+     assert_equal(Token.new('address.', 40, 48), t.next)
+     assert_equal(Token.new('23#@$', 49, 54), t.next)
+     assert_equal(Token.new('ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ', 55, 86), t.next)
+     assert(! t.next())
+     t.text = "one_two three"
+     assert_equal(Token.new("one_two", 0, 7), t.next())
+     assert_equal(Token.new("three", 8, 13), t.next())
+     assert(! t.next())
+     t = LowerCaseFilter.new(WhiteSpaceTokenizer.new(input))
+     assert_equal(Token.new('dbalmän@gmail.com', 0, 18), t.next)
+     assert_equal(Token.new('is', 19, 21), t.next)
+     assert_equal(Token.new('my', 22, 24), t.next)
+     assert_equal(Token.new('e-mail', 25, 31), t.next)
+     assert_equal(Token.new('52', 32, 34), t.next)
+     assert_equal(Token.new('#$', 37, 39), t.next)
+     assert_equal(Token.new('address.', 40, 48), t.next)
+     assert_equal(Token.new('23#@$', 49, 54), t.next)
+     assert_equal(Token.new('áägç®êëì¯úøã¬öîí', 55, 86), t.next)
+     assert(! t.next())
+     t = WhiteSpaceTokenizer.new(input, true)
+     assert_equal(Token.new('dbalmän@gmail.com', 0, 18), t.next)
+     assert_equal(Token.new('is', 19, 21), t.next)
+     assert_equal(Token.new('my', 22, 24), t.next)
+     assert_equal(Token.new('e-mail', 25, 31), t.next)
+     assert_equal(Token.new('52', 32, 34), t.next)
+     assert_equal(Token.new('#$', 37, 39), t.next)
+     assert_equal(Token.new('address.', 40, 48), t.next)
+     assert_equal(Token.new('23#@$', 49, 54), t.next)
+     assert_equal(Token.new('áägç®êëì¯úøã¬öîí', 55, 86), t.next)
+     assert(! t.next())
+   end
+ end if (/utf-8/i =~ Ferret.locale)
+
+ class AsciiStandardTokenizerTest < Test::Unit::TestCase
+   include Ferret::Analysis
+
+   def test_standard_tokenizer()
+     input = 'DBalmain@gmail.com is My e-mail 52   #$ Address. 23#@$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234'
+     t = AsciiStandardTokenizer.new(input)
+     assert_equal(Token.new('DBalmain@gmail.com', 0, 18), t.next)
+     assert_equal(Token.new('is', 19, 21), t.next)
+     assert_equal(Token.new('My', 22, 24), t.next)
+     assert_equal(Token.new('e-mail', 25, 31), t.next)
+     assert_equal(Token.new('52', 32, 34), t.next)
+     assert_equal(Token.new('Address', 40, 47), t.next)
+     assert_equal(Token.new('23', 49, 51), t.next)
+     assert_equal(Token.new('www.google.com/results', 55, 84), t.next)
+     assert_equal(Token.new('TNT', 86, 91), t.next)
+     assert_equal(Token.new('123-1235-ASD-1234', 93, 110), t.next)
+     assert(! t.next())
+     t.text = "one_two three"
+     assert_equal(Token.new("one_two", 0, 7), t.next())
+     assert_equal(Token.new("three", 8, 13), t.next())
+     assert(! t.next())
+     t = AsciiLowerCaseFilter.new(AsciiStandardTokenizer.new(input))
+     assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
+     assert_equal(Token.new('is', 19, 21), t.next)
+     assert_equal(Token.new('my', 22, 24), t.next)
+     assert_equal(Token.new('e-mail', 25, 31), t.next)
+     assert_equal(Token.new('52', 32, 34), t.next)
+     assert_equal(Token.new('address', 40, 47), t.next)
+     assert_equal(Token.new('23', 49, 51), t.next)
+     assert_equal(Token.new('www.google.com/results', 55, 84), t.next)
+     assert_equal(Token.new('tnt', 86, 91), t.next)
+     assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
+     assert(! t.next())
+   end
+ end
+
+ class StandardTokenizerTest < Test::Unit::TestCase
+   include Ferret::Analysis
+
+   def test_standard_tokenizer()
+     input = 'DBalmán@gmail.com is My e-mail 52   #$ Address. 23#@$ http://www.google.com/res_345/ T.N.T. 123-1235-ASD-1234 23#@$ ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ'
+     t = StandardTokenizer.new(input)
+     assert_equal(Token.new('DBalmán@gmail.com', 0, 18), t.next)
+     assert_equal(Token.new('is', 19, 21), t.next)
+     assert_equal(Token.new('My', 22, 24), t.next)
+     assert_equal(Token.new('e-mail', 25, 31), t.next)
+     assert_equal(Token.new('52', 32, 34), t.next)
+     assert_equal(Token.new('Address', 40, 47), t.next)
+     assert_equal(Token.new('23', 49, 51), t.next)
+     assert_equal(Token.new('www.google.com/res_345', 55, 84), t.next)
+     assert_equal(Token.new('TNT', 86, 91), t.next)
+     assert_equal(Token.new('123-1235-ASD-1234', 93, 110), t.next)
+     assert_equal(Token.new('23', 111, 113), t.next)
+     assert_equal(Token.new('ÁÄGÇ', 117, 124), t.next)
+     assert_equal(Token.new('ÊËÌ', 126, 132), t.next)
+     assert_equal(Token.new('ÚØÃ', 134, 140), t.next)
+     assert_equal(Token.new('ÖÎÍ', 142, 148), t.next)
+     assert(! t.next())
+     t.text = "one_two three"
+     assert_equal(Token.new("one_two", 0, 7), t.next())
+     assert_equal(Token.new("three", 8, 13), t.next())
+     assert(! t.next())
+     t = LowerCaseFilter.new(StandardTokenizer.new(input))
+     assert_equal(Token.new('dbalmán@gmail.com', 0, 18), t.next)
+     assert_equal(Token.new('is', 19, 21), t.next)
+     assert_equal(Token.new('my', 22, 24), t.next)
+     assert_equal(Token.new('e-mail', 25, 31), t.next)
+     assert_equal(Token.new('52', 32, 34), t.next)
+     assert_equal(Token.new('address', 40, 47), t.next)
+     assert_equal(Token.new('23', 49, 51), t.next)
+     assert_equal(Token.new('www.google.com/res_345', 55, 84), t.next)
+     assert_equal(Token.new('tnt', 86, 91), t.next)
+     assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
+     assert_equal(Token.new('23', 111, 113), t.next)
+     assert_equal(Token.new('áägç', 117, 124), t.next)
+     assert_equal(Token.new('êëì', 126, 132), t.next)
+     assert_equal(Token.new('úøã', 134, 140), t.next)
+     assert_equal(Token.new('öîí', 142, 148), t.next)
+     input = "e-mail 123-1235-asd-1234 http://www.davebalmain.com/trac-site/"
+     t = HyphenFilter.new(StandardTokenizer.new(input))
+     assert_equal(Token.new('email', 0, 6), t.next)
+     assert_equal(Token.new('e', 0, 1, 0), t.next)
+     assert_equal(Token.new('mail', 2, 6, 1), t.next)
+     assert_equal(Token.new('123-1235-asd-1234', 7, 24), t.next)
+     assert_equal(Token.new('www.davebalmain.com/trac-site', 25, 61), t.next)
+     assert(! t.next())
+   end
+ end if (/utf-8/i =~ Ferret.locale)
+
+ class RegExpTokenizerTest < Test::Unit::TestCase
+   include Ferret::Analysis
+
+   ALPHA = /[[:alpha:]_-]+/
+   APOSTROPHE = /#{ALPHA}('#{ALPHA})+/
+   ACRONYM = /#{ALPHA}\.(#{ALPHA}\.)+/
+   ACRONYM_WORD = /^#{ACRONYM}$/
+   APOSTROPHE_WORD = /^#{APOSTROPHE}$/
+
+   def test_reg_exp_tokenizer()
+     input = 'DBalmain@gmail.com is My e-mail 52   #$ Address. 23#@$ http://www.google.com/RESULT_3.html T.N.T. 123-1235-ASD-1234 23 Rob\'s'
+     t = RegExpTokenizer.new(input)
+     assert_equal(Token.new('DBalmain@gmail.com', 0, 18), t.next)
+     assert_equal(Token.new('is', 19, 21), t.next)
+     assert_equal(Token.new('My', 22, 24), t.next)
+     assert_equal(Token.new('e-mail', 25, 31), t.next)
+     assert_equal(Token.new('52', 32, 34), t.next)
+     assert_equal(Token.new('Address', 40, 47), t.next)
+     assert_equal(Token.new('23', 49, 51), t.next)
+     assert_equal(Token.new('http://www.google.com/RESULT_3.html', 55, 90), t.next)
+     assert_equal(Token.new('T.N.T.', 91, 97), t.next)
+     assert_equal(Token.new('123-1235-ASD-1234', 98, 115), t.next)
+     assert_equal(Token.new('23', 116, 118), t.next)
+     assert_equal(Token.new('Rob\'s', 119, 124), t.next)
+     assert(! t.next())
+     t.text = "one_two three"
+     assert_equal(Token.new("one_two", 0, 7), t.next())
+     assert_equal(Token.new("three", 8, 13), t.next())
+     assert(! t.next())
+     t = LowerCaseFilter.new(RegExpTokenizer.new(input))
+     t2 = LowerCaseFilter.new(RegExpTokenizer.new(input, /\w{2,}/))
+     assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
+     assert_equal(Token.new('is', 19, 21), t.next)
+     assert_equal(Token.new('my', 22, 24), t.next)
+     assert_equal(Token.new('e-mail', 25, 31), t.next)
+     assert_equal(Token.new('52', 32, 34), t.next)
+     assert_equal(Token.new('address', 40, 47), t.next)
+     assert_equal(Token.new('23', 49, 51), t.next)
+     assert_equal(Token.new('http://www.google.com/result_3.html', 55, 90), t.next)
+     assert_equal(Token.new('t.n.t.', 91, 97), t.next)
+     assert_equal(Token.new('123-1235-asd-1234', 98, 115), t.next)
+     assert_equal(Token.new('23', 116, 118), t.next)
+     assert_equal(Token.new('rob\'s', 119, 124), t.next)
+     assert(! t.next())
+     assert_equal(Token.new('dbalmain', 0, 8), t2.next)
+     assert_equal(Token.new('gmail', 9, 14), t2.next)
+     assert_equal(Token.new('com', 15, 18), t2.next)
+     assert_equal(Token.new('is', 19, 21), t2.next)
+     assert_equal(Token.new('my', 22, 24), t2.next)
+     assert_equal(Token.new('mail', 27, 31), t2.next)
+     assert_equal(Token.new('52', 32, 34), t2.next)
+     assert_equal(Token.new('address', 40, 47), t2.next)
+     assert_equal(Token.new('23', 49, 51), t2.next)
+     assert_equal(Token.new('http', 55, 59), t2.next)
+     assert_equal(Token.new('www', 62, 65), t2.next)
+     assert_equal(Token.new('google', 66, 72), t2.next)
+     assert_equal(Token.new('com', 73, 76), t2.next)
+     assert_equal(Token.new('result_3', 77, 85), t2.next)
+     assert_equal(Token.new('html', 86, 90), t2.next)
+     assert_equal(Token.new('123', 98, 101), t2.next)
+     assert_equal(Token.new('1235', 102, 106), t2.next)
+     assert_equal(Token.new('asd', 107, 110), t2.next)
+     assert_equal(Token.new('1234', 111, 115), t2.next)
+     assert_equal(Token.new('23', 116, 118), t2.next)
+     assert_equal(Token.new('rob', 119, 122), t2.next)
+     assert(! t2.next())
+     t = RegExpTokenizer.new(input) do |str|
+       if str =~ ACRONYM_WORD
+         str.gsub!(/\./, '')
+       elsif str =~ APOSTROPHE_WORD
+         str.gsub!(/'[sS]$/, '')
+       end
+       str
+     end
+     t = LowerCaseFilter.new(t)
+     assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
+     assert_equal(Token.new('is', 19, 21), t.next)
+     assert_equal(Token.new('my', 22, 24), t.next)
+     assert_equal(Token.new('e-mail', 25, 31), t.next)
+     assert_equal(Token.new('52', 32, 34), t.next)
+     assert_equal(Token.new('address', 40, 47), t.next)
+     assert_equal(Token.new('23', 49, 51), t.next)
+     assert_equal(Token.new('http://www.google.com/result_3.html', 55, 90), t.next)
+     assert_equal(Token.new('tnt', 91, 97), t.next)
+     assert_equal(Token.new('123-1235-asd-1234', 98, 115), t.next)
+     assert_equal(Token.new('23', 116, 118), t.next)
+     assert_equal(Token.new('rob', 119, 124), t.next)
+     assert(! t.next())
+   end
+ end
+
+ class MappingFilterTest < Test::Unit::TestCase
+   include Ferret::Analysis
+
+   def test_mapping_filter()
+     mapping = {
+       ['à','á','â','ã','ä','å','ā','ă'] => 'a',
+       'æ' => 'ae',
+       ['ď','đ'] => 'd',
+       ['ç','ć','č','ĉ','ċ'] => 'c',
+       ['è','é','ê','ë','ē','ę','ě','ĕ','ė'] => 'e',
+       ['ƒ'] => 'f',
+       ['ĝ','ğ','ġ','ģ'] => 'g',
+       ['ĥ','ħ'] => 'h',
+       ['ì','ì','í','î','ï','ī','ĩ','ĭ'] => 'i',
+       ['į','ı','ij','ĵ'] => 'j',
+       ['ķ','ĸ'] => 'k',
+       ['ł','ľ','ĺ','ļ','ŀ'] => 'l',
+       ['ñ','ń','ň','ņ','ʼn','ŋ'] => 'n',
+       ['ò','ó','ô','õ','ö','ø','ō','ő','ŏ','ŏ'] => 'o',
+       'œ' => 'oek',
+       'ą' => 'q',
+       ['ŕ','ř','ŗ'] => 'r',
+       ['ś','š','ş','ŝ','ș'] => 's',
+       ['ť','ţ','ŧ','ț'] => 't',
+       ['ù','ú','û','ü','ū','ů','ű','ŭ','ũ','ų'] => 'u',
+       'ŵ' => 'w',
+       ['ý','ÿ','ŷ'] => 'y',
+       ['ž','ż','ź'] => 'z'
+     }
+     input = <<END
+ aàáâãäåāăb cæd eďđf gçćčĉċh ièéêëēęěĕėj kƒl mĝğġģn oĥħp qììíîïīĩĭr sįıijĵt uķĸv
+ włľĺļŀx yñńňņʼnŋz aòóôõöøōőŏŏb cœd eąf gŕřŗh iśšşŝșj kťţŧțl mùúûüūůűŭũųn oŵp
+ qýÿŷr sžżźt
+ END
+     t = MappingFilter.new(LetterTokenizer.new(input), mapping)
+     assert_equal(Token.new('aaaaaaaaab', 0, 18), t.next)
+     assert_equal(Token.new('caed', 19, 23), t.next)
+     assert_equal(Token.new('eddf', 24, 30), t.next)
+     assert_equal(Token.new('gccccch', 31, 43), t.next)
+     assert_equal(Token.new('ieeeeeeeeej', 44, 64), t.next)
+     assert_equal(Token.new('kfl', 65, 69), t.next)
+     assert_equal(Token.new('mggggn', 70, 80), t.next)
+     assert_equal(Token.new('ohhp', 81, 87), t.next)
+     assert_equal(Token.new('qiiiiiiiir', 88, 106), t.next)
+     assert_equal(Token.new('sjjjjt', 107, 117), t.next)
+     assert_equal(Token.new('ukkv', 118, 124), t.next)
+     assert_equal(Token.new('wlllllx', 125, 137), t.next)
+     assert_equal(Token.new('ynnnnnnz', 138, 152), t.next)
+     assert_equal(Token.new('aoooooooooob', 153, 175), t.next)
+     assert_equal(Token.new('coekd', 176, 180), t.next)
+     assert_equal(Token.new('eqf', 181, 185), t.next)
+     assert_equal(Token.new('grrrh', 186, 194), t.next)
+     assert_equal(Token.new('isssssj', 195, 207), t.next)
+     assert_equal(Token.new('kttttl', 208, 218), t.next)
+     assert_equal(Token.new('muuuuuuuuuun', 219, 241), t.next)
+     assert_equal(Token.new('owp', 242, 246), t.next)
+     assert_equal(Token.new('qyyyr', 247, 255), t.next)
+     assert_equal(Token.new('szzzt', 256, 264), t.next)
+     assert(! t.next())
+   end
+ end if (/utf-8/i =~ Ferret.locale)
+
+ class StopFilterTest < Test::Unit::TestCase
+   include Ferret::Analysis
+
+   def test_stop_filter()
+     words = ["one", "four", "five", "seven"]
+     input = "one, two, three, four, five, six, seven, eight, nine, ten."
+     t = StopFilter.new(AsciiLetterTokenizer.new(input), words)
+     assert_equal(Token.new('two', 5, 8, 2), t.next)
+     assert_equal(Token.new('three', 10, 15, 1), t.next)
+     assert_equal(Token.new('six', 29, 32, 3), t.next)
+     assert_equal(Token.new('eight', 41, 46, 2), t.next)
+     assert_equal(Token.new('nine', 48, 52, 1), t.next)
+     assert_equal(Token.new('ten', 54, 57, 1), t.next)
+     assert(! t.next())
+   end
+ end
+
+ class StemFilterTest < Test::Unit::TestCase
+   include Ferret::Analysis
+
+   def test_stem_filter()
+     input = "Debate Debates DEBATED DEBating Debater"
+     t = StemFilter.new(AsciiLowerCaseFilter.new(AsciiLetterTokenizer.new(input)),
+                        "english")
+     assert_equal(Token.new("debat", 0, 6), t.next)
+     assert_equal(Token.new("debat", 7, 14), t.next)
+     assert_equal(Token.new("debat", 15, 22), t.next)
+     assert_equal(Token.new("debat", 23, 31), t.next)
+     assert_equal(Token.new("debat", 32, 39), t.next)
+     assert(! t.next())
+     t = StemFilter.new(AsciiLetterTokenizer.new(input), :english)
+     assert_equal(Token.new("Debat", 0, 6), t.next)
+     assert_equal(Token.new("Debat", 7, 14), t.next)
+     assert_equal(Token.new("DEBATED", 15, 22), t.next)
+     assert_equal(Token.new("DEBate", 23, 31), t.next)
+     assert_equal(Token.new("Debat", 32, 39), t.next)
+
+     if Ferret.locale and Ferret.locale.downcase.index("utf")
+       input = "Dêbate dêbates DÊBATED DÊBATing dêbater"
+       t = StemFilter.new(LowerCaseFilter.new(LetterTokenizer.new(input)), :english)
+       assert_equal(Token.new("dêbate", 0, 7), t.next)
+       assert_equal(Token.new("dêbate", 8, 16), t.next)
+       assert_equal(Token.new("dêbate", 17, 25), t.next)
+       assert_equal(Token.new("dêbate", 26, 35), t.next)
+       assert_equal(Token.new("dêbater", 36, 44), t.next)
+       t = StemFilter.new(LetterTokenizer.new(input), :english)
+       assert_equal(Token.new("Dêbate", 0, 7), t.next)
+       assert_equal(Token.new("dêbate", 8, 16), t.next)
+       assert_equal(Token.new("DÊBATED", 17, 25), t.next)
+       assert_equal(Token.new("DÊBATing", 26, 35), t.next)
+       assert_equal(Token.new("dêbater", 36, 44), t.next)
+       assert(! t.next())
+     end
+   end
+ end
+
+ require 'strscan'
+ module Ferret::Analysis
+
+   class MyRegExpTokenizer < TokenStream
+
+     def initialize(input)
+       @ss = StringScanner.new(input)
+     end
+
+     # Returns the next token in the stream, or nil at EOS.
+     def next()
+       if @ss.scan_until(token_re)
+         term = @ss.matched
+         term_end = @ss.pos
+         term_start = term_end - term.size
+       else
+         return nil
+       end
+
+       return Token.new(normalize(term), term_start, term_end)
+     end
+
+     def text=(text)
+       @ss = StringScanner.new(text)
+     end
+
+     protected
+
+     # Returns the regular expression used to find the next token.
+     TOKEN_RE = /[[:alpha:]]+/
+     def token_re
+       TOKEN_RE
+     end
+
+     # Called on each token to normalize it before it is added to the
+     # token stream. The default implementation does nothing; subclasses
+     # may use this to, e.g., lowercase tokens.
+     def normalize(str) return str end
+   end
+
+   class MyReverseTokenFilter < TokenStream
+     def initialize(token_stream)
+       @token_stream = token_stream
+     end
+
+     def text=(text)
+       @token_stream.text = text
+     end
+
+     def next()
+       if token = @token_stream.next
+         token.text = token.text.reverse
+       end
+       token
+     end
+   end
+
+   class MyCSVTokenizer < MyRegExpTokenizer
+     protected
+
+     # Returns the regular expression used to find the next token:
+     # everything up to the next comma.
+     TOKEN_RE = /[^,]+/
+     def token_re
+       TOKEN_RE
+     end
+
+     # Called on each token to normalize it before it is added to the
+     # token stream. Here each raw CSV field is upcased.
+     def normalize(str) return str.upcase end
+   end
+ end
+
+ class CustomTokenizerTest < Test::Unit::TestCase
+   include Ferret::Analysis
+
+   def test_custom_tokenizer()
+     input = "First Field,2nd Field,  P a d d e d  F i e l d  "
+     t = MyCSVTokenizer.new(input)
+     assert_equal(Token.new("FIRST FIELD", 0, 11), t.next)
+     assert_equal(Token.new("2ND FIELD", 12, 21), t.next)
+     assert_equal(Token.new("  P A D D E D  F I E L D  ", 22, 48), t.next)
+     assert(! t.next())
+     t = AsciiLowerCaseFilter.new(MyCSVTokenizer.new(input))
+     assert_equal(Token.new("first field", 0, 11), t.next)
+     assert_equal(Token.new("2nd field", 12, 21), t.next)
+     assert_equal(Token.new("  p a d d e d  f i e l d  ", 22, 48), t.next)
+     assert(! t.next())
+     t = MyReverseTokenFilter.new(
+           AsciiLowerCaseFilter.new(MyCSVTokenizer.new(input)))
+     assert_equal(Token.new("dleif tsrif", 0, 11), t.next)
+     assert_equal(Token.new("dleif dn2", 12, 21), t.next)
+     assert_equal(Token.new("  d l e i f  d e d d a p  ", 22, 48), t.next)
+     t.text = "one,TWO,three"
+     assert_equal(Token.new("eno", 0, 3), t.next)
+     assert_equal(Token.new("owt", 4, 7), t.next)
+     assert_equal(Token.new("eerht", 8, 13), t.next)
+     t = AsciiLowerCaseFilter.new(
+           MyReverseTokenFilter.new(MyCSVTokenizer.new(input)))
+     assert_equal(Token.new("dleif tsrif", 0, 11), t.next)
+     assert_equal(Token.new("dleif dn2", 12, 21), t.next)
+     assert_equal(Token.new("  d l e i f  d e d d a p  ", 22, 48), t.next)
+     t.text = "one,TWO,three"
+     assert_equal(Token.new("eno", 0, 3), t.next)
+     assert_equal(Token.new("owt", 4, 7), t.next)
+     assert_equal(Token.new("eerht", 8, 13), t.next)
+   end
+ end
+
+ module Ferret::Analysis
+   class TokenFilter < TokenStream
+     protected
+
+     # Construct a token stream filtering the given input.
+     def initialize(input)
+       @input = input
+     end
+   end
+
+   # Capitalizes token text.
+   class CapitalizeFilter < TokenFilter
+     def next()
+       t = @input.next()
+
+       return nil if (t.nil?)
+
+       t.text = t.text.capitalize
+
+       return t
+     end
+   end
+ end
+
+ class CustomFilterTest < Test::Unit::TestCase
+   include Ferret::Analysis
+
+   def test_custom_filter()
+     input = "This text SHOULD be capitalized ... I hope. :-S"
+     t = CapitalizeFilter.new(AsciiLetterTokenizer.new(input))
+     assert_equal(Token.new("This", 0, 4), t.next)
+     assert_equal(Token.new("Text", 5, 9), t.next)
+     assert_equal(Token.new("Should", 10, 16), t.next)
+     assert_equal(Token.new("Be", 17, 19), t.next)
+     assert_equal(Token.new("Capitalized", 20, 31), t.next)
+     assert_equal(Token.new("I", 36, 37), t.next)
+     assert_equal(Token.new("Hope", 38, 42), t.next)
+     assert_equal(Token.new("S", 46, 47), t.next)
+     assert(! t.next())
+     t = StemFilter.new(CapitalizeFilter.new(AsciiLetterTokenizer.new(input)))
+     assert_equal(Token.new("This", 0, 4), t.next)
+     assert_equal(Token.new("Text", 5, 9), t.next)
+     assert_equal(Token.new("Should", 10, 16), t.next)
+     assert_equal(Token.new("Be", 17, 19), t.next)
+     assert_equal(Token.new("Capit", 20, 31), t.next)
+     assert_equal(Token.new("I", 36, 37), t.next)
+     assert_equal(Token.new("Hope", 38, 42), t.next)
+     assert_equal(Token.new("S", 46, 47), t.next)
+     assert(! t.next())
+   end
+ end
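
The tests above exercise Ferret's analysis extension points: a tokenizer is any TokenStream subclass that implements next() and text=, and a filter is a TokenStream that wraps another stream. As a minimal sketch of how such a chain is put to work outside the test suite, the snippet below wires a custom filter into an index. It relies on Ferret's public Analyzer#token_stream(field, text) contract and the Index.new(:analyzer => ...) option; the MyCapitalizeFilter and CapitalizeAnalyzer names are illustrative, not part of this gem.

    require 'ferret'
    include Ferret::Analysis

    # Same pattern as the test filters above: wrap a stream, rewrite each token.
    class MyCapitalizeFilter < TokenStream
      def initialize(input) @input = input end
      def text=(text) @input.text = text end
      def next
        t = @input.next
        t.text = t.text.capitalize if t
        t
      end
    end

    # An analyzer's only job is to build a fresh filter chain per field.
    class CapitalizeAnalyzer < Analyzer
      def token_stream(field, text)
        MyCapitalizeFilter.new(LetterTokenizer.new(text))
      end
    end

    index = Ferret::Index::Index.new(:analyzer => CapitalizeAnalyzer.new)
    index << {:content => "ferret is a ruby port of lucene"}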