sdsykes-ferret 0.11.6.19

Sign up to get free protection for your applications and to get access to all the features.
Files changed (195) hide show
  1. data/CHANGELOG +24 -0
  2. data/MIT-LICENSE +20 -0
  3. data/README +102 -0
  4. data/Rakefile +338 -0
  5. data/TODO +17 -0
  6. data/TUTORIAL +231 -0
  7. data/bin/ferret-browser +79 -0
  8. data/ext/analysis.c +1555 -0
  9. data/ext/analysis.h +219 -0
  10. data/ext/api.c +69 -0
  11. data/ext/api.h +27 -0
  12. data/ext/array.c +123 -0
  13. data/ext/array.h +53 -0
  14. data/ext/bitvector.c +540 -0
  15. data/ext/bitvector.h +272 -0
  16. data/ext/compound_io.c +383 -0
  17. data/ext/config.h +42 -0
  18. data/ext/document.c +156 -0
  19. data/ext/document.h +53 -0
  20. data/ext/except.c +120 -0
  21. data/ext/except.h +168 -0
  22. data/ext/extconf.rb +14 -0
  23. data/ext/ferret.c +402 -0
  24. data/ext/ferret.h +91 -0
  25. data/ext/filter.c +156 -0
  26. data/ext/fs_store.c +483 -0
  27. data/ext/global.c +418 -0
  28. data/ext/global.h +117 -0
  29. data/ext/hash.c +567 -0
  30. data/ext/hash.h +473 -0
  31. data/ext/hashset.c +170 -0
  32. data/ext/hashset.h +187 -0
  33. data/ext/header.h +58 -0
  34. data/ext/helper.c +62 -0
  35. data/ext/helper.h +13 -0
  36. data/ext/inc/lang.h +48 -0
  37. data/ext/inc/threading.h +31 -0
  38. data/ext/index.c +6425 -0
  39. data/ext/index.h +961 -0
  40. data/ext/lang.h +66 -0
  41. data/ext/libstemmer.c +92 -0
  42. data/ext/libstemmer.h +79 -0
  43. data/ext/mempool.c +87 -0
  44. data/ext/mempool.h +35 -0
  45. data/ext/modules.h +162 -0
  46. data/ext/multimapper.c +310 -0
  47. data/ext/multimapper.h +51 -0
  48. data/ext/posh.c +1006 -0
  49. data/ext/posh.h +1007 -0
  50. data/ext/priorityqueue.c +151 -0
  51. data/ext/priorityqueue.h +143 -0
  52. data/ext/q_boolean.c +1608 -0
  53. data/ext/q_const_score.c +161 -0
  54. data/ext/q_filtered_query.c +209 -0
  55. data/ext/q_fuzzy.c +268 -0
  56. data/ext/q_match_all.c +148 -0
  57. data/ext/q_multi_term.c +677 -0
  58. data/ext/q_parser.c +2825 -0
  59. data/ext/q_phrase.c +1126 -0
  60. data/ext/q_prefix.c +100 -0
  61. data/ext/q_range.c +350 -0
  62. data/ext/q_span.c +2402 -0
  63. data/ext/q_term.c +337 -0
  64. data/ext/q_wildcard.c +171 -0
  65. data/ext/r_analysis.c +2575 -0
  66. data/ext/r_index.c +3472 -0
  67. data/ext/r_qparser.c +585 -0
  68. data/ext/r_search.c +4105 -0
  69. data/ext/r_store.c +513 -0
  70. data/ext/r_utils.c +963 -0
  71. data/ext/ram_store.c +471 -0
  72. data/ext/search.c +1741 -0
  73. data/ext/search.h +885 -0
  74. data/ext/similarity.c +150 -0
  75. data/ext/similarity.h +82 -0
  76. data/ext/sort.c +983 -0
  77. data/ext/stem_ISO_8859_1_danish.c +338 -0
  78. data/ext/stem_ISO_8859_1_danish.h +16 -0
  79. data/ext/stem_ISO_8859_1_dutch.c +635 -0
  80. data/ext/stem_ISO_8859_1_dutch.h +16 -0
  81. data/ext/stem_ISO_8859_1_english.c +1156 -0
  82. data/ext/stem_ISO_8859_1_english.h +16 -0
  83. data/ext/stem_ISO_8859_1_finnish.c +792 -0
  84. data/ext/stem_ISO_8859_1_finnish.h +16 -0
  85. data/ext/stem_ISO_8859_1_french.c +1276 -0
  86. data/ext/stem_ISO_8859_1_french.h +16 -0
  87. data/ext/stem_ISO_8859_1_german.c +512 -0
  88. data/ext/stem_ISO_8859_1_german.h +16 -0
  89. data/ext/stem_ISO_8859_1_italian.c +1091 -0
  90. data/ext/stem_ISO_8859_1_italian.h +16 -0
  91. data/ext/stem_ISO_8859_1_norwegian.c +296 -0
  92. data/ext/stem_ISO_8859_1_norwegian.h +16 -0
  93. data/ext/stem_ISO_8859_1_porter.c +776 -0
  94. data/ext/stem_ISO_8859_1_porter.h +16 -0
  95. data/ext/stem_ISO_8859_1_portuguese.c +1035 -0
  96. data/ext/stem_ISO_8859_1_portuguese.h +16 -0
  97. data/ext/stem_ISO_8859_1_spanish.c +1119 -0
  98. data/ext/stem_ISO_8859_1_spanish.h +16 -0
  99. data/ext/stem_ISO_8859_1_swedish.c +307 -0
  100. data/ext/stem_ISO_8859_1_swedish.h +16 -0
  101. data/ext/stem_KOI8_R_russian.c +701 -0
  102. data/ext/stem_KOI8_R_russian.h +16 -0
  103. data/ext/stem_UTF_8_danish.c +344 -0
  104. data/ext/stem_UTF_8_danish.h +16 -0
  105. data/ext/stem_UTF_8_dutch.c +653 -0
  106. data/ext/stem_UTF_8_dutch.h +16 -0
  107. data/ext/stem_UTF_8_english.c +1176 -0
  108. data/ext/stem_UTF_8_english.h +16 -0
  109. data/ext/stem_UTF_8_finnish.c +808 -0
  110. data/ext/stem_UTF_8_finnish.h +16 -0
  111. data/ext/stem_UTF_8_french.c +1296 -0
  112. data/ext/stem_UTF_8_french.h +16 -0
  113. data/ext/stem_UTF_8_german.c +526 -0
  114. data/ext/stem_UTF_8_german.h +16 -0
  115. data/ext/stem_UTF_8_italian.c +1113 -0
  116. data/ext/stem_UTF_8_italian.h +16 -0
  117. data/ext/stem_UTF_8_norwegian.c +302 -0
  118. data/ext/stem_UTF_8_norwegian.h +16 -0
  119. data/ext/stem_UTF_8_porter.c +794 -0
  120. data/ext/stem_UTF_8_porter.h +16 -0
  121. data/ext/stem_UTF_8_portuguese.c +1055 -0
  122. data/ext/stem_UTF_8_portuguese.h +16 -0
  123. data/ext/stem_UTF_8_russian.c +709 -0
  124. data/ext/stem_UTF_8_russian.h +16 -0
  125. data/ext/stem_UTF_8_spanish.c +1137 -0
  126. data/ext/stem_UTF_8_spanish.h +16 -0
  127. data/ext/stem_UTF_8_swedish.c +313 -0
  128. data/ext/stem_UTF_8_swedish.h +16 -0
  129. data/ext/stopwords.c +401 -0
  130. data/ext/store.c +692 -0
  131. data/ext/store.h +777 -0
  132. data/ext/term_vectors.c +352 -0
  133. data/ext/threading.h +31 -0
  134. data/ext/utilities.c +446 -0
  135. data/ext/win32.h +54 -0
  136. data/lib/ferret.rb +29 -0
  137. data/lib/ferret/browser.rb +246 -0
  138. data/lib/ferret/browser/s/global.js +192 -0
  139. data/lib/ferret/browser/s/style.css +148 -0
  140. data/lib/ferret/browser/views/document/list.rhtml +49 -0
  141. data/lib/ferret/browser/views/document/show.rhtml +27 -0
  142. data/lib/ferret/browser/views/error/index.rhtml +7 -0
  143. data/lib/ferret/browser/views/help/index.rhtml +8 -0
  144. data/lib/ferret/browser/views/home/index.rhtml +29 -0
  145. data/lib/ferret/browser/views/layout.rhtml +22 -0
  146. data/lib/ferret/browser/views/term-vector/index.rhtml +4 -0
  147. data/lib/ferret/browser/views/term/index.rhtml +199 -0
  148. data/lib/ferret/browser/views/term/termdocs.rhtml +1 -0
  149. data/lib/ferret/browser/webrick.rb +14 -0
  150. data/lib/ferret/document.rb +130 -0
  151. data/lib/ferret/field_infos.rb +44 -0
  152. data/lib/ferret/index.rb +786 -0
  153. data/lib/ferret/number_tools.rb +157 -0
  154. data/lib/ferret_version.rb +3 -0
  155. data/setup.rb +1555 -0
  156. data/test/test_all.rb +5 -0
  157. data/test/test_helper.rb +24 -0
  158. data/test/threading/number_to_spoken.rb +132 -0
  159. data/test/threading/thread_safety_index_test.rb +79 -0
  160. data/test/threading/thread_safety_read_write_test.rb +76 -0
  161. data/test/threading/thread_safety_test.rb +133 -0
  162. data/test/unit/analysis/tc_analyzer.rb +548 -0
  163. data/test/unit/analysis/tc_token_stream.rb +646 -0
  164. data/test/unit/index/tc_index.rb +762 -0
  165. data/test/unit/index/tc_index_reader.rb +699 -0
  166. data/test/unit/index/tc_index_writer.rb +437 -0
  167. data/test/unit/index/th_doc.rb +315 -0
  168. data/test/unit/largefile/tc_largefile.rb +46 -0
  169. data/test/unit/query_parser/tc_query_parser.rb +238 -0
  170. data/test/unit/search/tc_filter.rb +135 -0
  171. data/test/unit/search/tc_fuzzy_query.rb +147 -0
  172. data/test/unit/search/tc_index_searcher.rb +61 -0
  173. data/test/unit/search/tc_multi_searcher.rb +128 -0
  174. data/test/unit/search/tc_multiple_search_requests.rb +58 -0
  175. data/test/unit/search/tc_search_and_sort.rb +179 -0
  176. data/test/unit/search/tc_sort.rb +49 -0
  177. data/test/unit/search/tc_sort_field.rb +27 -0
  178. data/test/unit/search/tc_spans.rb +190 -0
  179. data/test/unit/search/tm_searcher.rb +384 -0
  180. data/test/unit/store/tc_fs_store.rb +77 -0
  181. data/test/unit/store/tc_ram_store.rb +35 -0
  182. data/test/unit/store/tm_store.rb +34 -0
  183. data/test/unit/store/tm_store_lock.rb +68 -0
  184. data/test/unit/tc_document.rb +81 -0
  185. data/test/unit/ts_analysis.rb +2 -0
  186. data/test/unit/ts_index.rb +2 -0
  187. data/test/unit/ts_largefile.rb +4 -0
  188. data/test/unit/ts_query_parser.rb +2 -0
  189. data/test/unit/ts_search.rb +2 -0
  190. data/test/unit/ts_store.rb +2 -0
  191. data/test/unit/ts_utils.rb +2 -0
  192. data/test/unit/utils/tc_bit_vector.rb +295 -0
  193. data/test/unit/utils/tc_number_tools.rb +117 -0
  194. data/test/unit/utils/tc_priority_queue.rb +106 -0
  195. metadata +285 -0
@@ -0,0 +1,548 @@
1
+ require File.dirname(__FILE__) + "/../../test_helper"
2
+
3
+ class AnalyzerTest < Test::Unit::TestCase
4
+ include Ferret::Analysis
5
+
6
+ def test_analyzer()
7
+ input = 'DBalmain@gmail.com is My E-Mail 523@#$ ADDRESS. 23#@$'
8
+ a = Analyzer.new()
9
+ t = a.token_stream("fieldname", input)
10
+ t2 = a.token_stream("fieldname", input)
11
+ assert_equal(Token.new("dbalmain", 0, 8), t.next())
12
+ assert_equal(Token.new("gmail", 9, 14), t.next())
13
+ assert_equal(Token.new("com", 15, 18), t.next())
14
+ assert_equal(Token.new("is", 19, 21), t.next())
15
+ assert_equal(Token.new("my", 22, 24), t.next())
16
+ assert_equal(Token.new("e", 25, 26), t.next())
17
+ assert_equal(Token.new("mail", 27, 31), t.next())
18
+ assert_equal(Token.new("address", 39, 46), t.next())
19
+ assert(! t.next())
20
+ assert_equal(Token.new("dbalmain", 0, 8), t2.next())
21
+ assert_equal(Token.new("gmail", 9, 14), t2.next())
22
+ assert_equal(Token.new("com", 15, 18), t2.next())
23
+ assert_equal(Token.new("is", 19, 21), t2.next())
24
+ assert_equal(Token.new("my", 22, 24), t2.next())
25
+ assert_equal(Token.new("e", 25, 26), t2.next())
26
+ assert_equal(Token.new("mail", 27, 31), t2.next())
27
+ assert_equal(Token.new("address", 39, 46), t2.next())
28
+ assert(! t2.next())
29
+ a = Analyzer.new(false)
30
+ t = a.token_stream("fieldname", input)
31
+ assert_equal(Token.new("DBalmain", 0, 8), t.next())
32
+ assert_equal(Token.new("gmail", 9, 14), t.next())
33
+ assert_equal(Token.new("com", 15, 18), t.next())
34
+ assert_equal(Token.new("is", 19, 21), t.next())
35
+ assert_equal(Token.new("My", 22, 24), t.next())
36
+ assert_equal(Token.new("E", 25, 26), t.next())
37
+ assert_equal(Token.new("Mail", 27, 31), t.next())
38
+ assert_equal(Token.new("ADDRESS", 39, 46), t.next())
39
+ assert(! t.next())
40
+ end
41
+ end if (/utf-8/i =~ Ferret.locale)
42
+
43
+ class AsciiLetterAnalyzerTest < Test::Unit::TestCase
44
+ include Ferret::Analysis
45
+
46
+ def test_letter_analyzer()
47
+ input = 'DBalmain@gmail.com is My E-Mail 523@#$ ADDRESS. 23#@$'
48
+ a = AsciiLetterAnalyzer.new()
49
+ t = a.token_stream("fieldname", input)
50
+ t2 = a.token_stream("fieldname", input)
51
+ assert_equal(Token.new("dbalmain", 0, 8), t.next())
52
+ assert_equal(Token.new("gmail", 9, 14), t.next())
53
+ assert_equal(Token.new("com", 15, 18), t.next())
54
+ assert_equal(Token.new("is", 19, 21), t.next())
55
+ assert_equal(Token.new("my", 22, 24), t.next())
56
+ assert_equal(Token.new("e", 25, 26), t.next())
57
+ assert_equal(Token.new("mail", 27, 31), t.next())
58
+ assert_equal(Token.new("address", 39, 46), t.next())
59
+ assert(! t.next())
60
+ assert_equal(Token.new("dbalmain", 0, 8), t2.next())
61
+ assert_equal(Token.new("gmail", 9, 14), t2.next())
62
+ assert_equal(Token.new("com", 15, 18), t2.next())
63
+ assert_equal(Token.new("is", 19, 21), t2.next())
64
+ assert_equal(Token.new("my", 22, 24), t2.next())
65
+ assert_equal(Token.new("e", 25, 26), t2.next())
66
+ assert_equal(Token.new("mail", 27, 31), t2.next())
67
+ assert_equal(Token.new("address", 39, 46), t2.next())
68
+ assert(! t2.next())
69
+ a = AsciiLetterAnalyzer.new(false)
70
+ t = a.token_stream("fieldname", input)
71
+ assert_equal(Token.new("DBalmain", 0, 8), t.next())
72
+ assert_equal(Token.new("gmail", 9, 14), t.next())
73
+ assert_equal(Token.new("com", 15, 18), t.next())
74
+ assert_equal(Token.new("is", 19, 21), t.next())
75
+ assert_equal(Token.new("My", 22, 24), t.next())
76
+ assert_equal(Token.new("E", 25, 26), t.next())
77
+ assert_equal(Token.new("Mail", 27, 31), t.next())
78
+ assert_equal(Token.new("ADDRESS", 39, 46), t.next())
79
+ assert(! t.next())
80
+ end
81
+ end
82
+
83
+ class LetterAnalyzerTest < Test::Unit::TestCase
84
+ include Ferret::Analysis
85
+
86
+ def test_letter_analyzer()
87
+ Ferret.locale = ""
88
+ input = 'DBalmän@gmail.com is My e-mail 52 #$ address. 23#@$ ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ'
89
+ a = LetterAnalyzer.new(false)
90
+ t = a.token_stream("fieldname", input)
91
+ t2 = a.token_stream("fieldname", input)
92
+ assert_equal(Token.new("DBalmän", 0, 8), t.next)
93
+ assert_equal(Token.new("gmail", 9, 14), t.next)
94
+ assert_equal(Token.new("com", 15, 18), t.next)
95
+ assert_equal(Token.new("is", 19, 21), t.next)
96
+ assert_equal(Token.new("My", 22, 24), t.next)
97
+ assert_equal(Token.new("e", 25, 26), t.next)
98
+ assert_equal(Token.new("mail", 27, 31), t.next)
99
+ assert_equal(Token.new("address", 40, 47), t.next)
100
+ assert_equal(Token.new("ÁÄGÇ", 55, 62), t.next)
101
+ assert_equal(Token.new("ÊËÌ", 64, 70), t.next)
102
+ assert_equal(Token.new("ÚØÃ", 72, 78), t.next)
103
+ assert_equal(Token.new("ÖÎÍ", 80, 86), t.next)
104
+ assert(! t.next())
105
+ assert_equal(Token.new("DBalmän", 0, 8), t2.next)
106
+ assert_equal(Token.new("gmail", 9, 14), t2.next)
107
+ assert_equal(Token.new("com", 15, 18), t2.next)
108
+ assert_equal(Token.new("is", 19, 21), t2.next)
109
+ assert_equal(Token.new("My", 22, 24), t2.next)
110
+ assert_equal(Token.new("e", 25, 26), t2.next)
111
+ assert_equal(Token.new("mail", 27, 31), t2.next)
112
+ assert_equal(Token.new("address", 40, 47), t2.next)
113
+ assert_equal(Token.new("ÁÄGÇ", 55, 62), t2.next)
114
+ assert_equal(Token.new("ÊËÌ", 64, 70), t2.next)
115
+ assert_equal(Token.new("ÚØÃ", 72, 78), t2.next)
116
+ assert_equal(Token.new("ÖÎÍ", 80, 86), t2.next)
117
+ assert(! t2.next())
118
+ a = LetterAnalyzer.new()
119
+ t = a.token_stream("fieldname", input)
120
+ assert_equal(Token.new("dbalmän", 0, 8), t.next)
121
+ assert_equal(Token.new("gmail", 9, 14), t.next)
122
+ assert_equal(Token.new("com", 15, 18), t.next)
123
+ assert_equal(Token.new("is", 19, 21), t.next)
124
+ assert_equal(Token.new("my", 22, 24), t.next)
125
+ assert_equal(Token.new("e", 25, 26), t.next)
126
+ assert_equal(Token.new("mail", 27, 31), t.next)
127
+ assert_equal(Token.new("address", 40, 47), t.next)
128
+ assert_equal(Token.new("áägç", 55, 62), t.next)
129
+ assert_equal(Token.new("êëì", 64, 70), t.next)
130
+ assert_equal(Token.new("úøã", 72, 78), t.next)
131
+ assert_equal(Token.new("öîí", 80, 86), t.next)
132
+ assert(! t.next())
133
+ end
134
+ end if (/utf-8/i =~ Ferret.locale)
135
+
136
+ class AsciiWhiteSpaceAnalyzerTest < Test::Unit::TestCase
137
+ include Ferret::Analysis
138
+
139
+ def test_white_space_analyzer()
140
+ input = 'DBalmain@gmail.com is My E-Mail 52 #$ ADDRESS. 23#@$'
141
+ a = AsciiWhiteSpaceAnalyzer.new()
142
+ t = a.token_stream("fieldname", input)
143
+ t2 = a.token_stream("fieldname", input)
144
+ assert_equal(Token.new('DBalmain@gmail.com', 0, 18), t.next)
145
+ assert_equal(Token.new('is', 19, 21), t.next)
146
+ assert_equal(Token.new('My', 22, 24), t.next)
147
+ assert_equal(Token.new('E-Mail', 25, 31), t.next)
148
+ assert_equal(Token.new('52', 32, 34), t.next)
149
+ assert_equal(Token.new('#$', 37, 39), t.next)
150
+ assert_equal(Token.new('ADDRESS.', 40, 48), t.next)
151
+ assert_equal(Token.new('23#@$', 49, 54), t.next)
152
+ assert(! t.next())
153
+ assert_equal(Token.new('DBalmain@gmail.com', 0, 18), t2.next)
154
+ assert_equal(Token.new('is', 19, 21), t2.next)
155
+ assert_equal(Token.new('My', 22, 24), t2.next)
156
+ assert_equal(Token.new('E-Mail', 25, 31), t2.next)
157
+ assert_equal(Token.new('52', 32, 34), t2.next)
158
+ assert_equal(Token.new('#$', 37, 39), t2.next)
159
+ assert_equal(Token.new('ADDRESS.', 40, 48), t2.next)
160
+ assert_equal(Token.new('23#@$', 49, 54), t2.next)
161
+ assert(! t2.next())
162
+ a = AsciiWhiteSpaceAnalyzer.new(true)
163
+ t = a.token_stream("fieldname", input)
164
+ assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
165
+ assert_equal(Token.new('is', 19, 21), t.next)
166
+ assert_equal(Token.new('my', 22, 24), t.next)
167
+ assert_equal(Token.new('e-mail', 25, 31), t.next)
168
+ assert_equal(Token.new('52', 32, 34), t.next)
169
+ assert_equal(Token.new('#$', 37, 39), t.next)
170
+ assert_equal(Token.new('address.', 40, 48), t.next)
171
+ assert_equal(Token.new('23#@$', 49, 54), t.next)
172
+ assert(! t.next())
173
+ end
174
+ end
175
+
176
+ class WhiteSpaceAnalyzerTest < Test::Unit::TestCase
177
+ include Ferret::Analysis
178
+
179
+ def test_white_space_analyzer()
180
+ input = 'DBalmän@gmail.com is My e-mail 52 #$ address. 23#@$ ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ'
181
+ a = WhiteSpaceAnalyzer.new()
182
+ t = a.token_stream("fieldname", input)
183
+ t2 = a.token_stream("fieldname", input)
184
+ assert_equal(Token.new('DBalmän@gmail.com', 0, 18), t.next)
185
+ assert_equal(Token.new('is', 19, 21), t.next)
186
+ assert_equal(Token.new('My', 22, 24), t.next)
187
+ assert_equal(Token.new('e-mail', 25, 31), t.next)
188
+ assert_equal(Token.new('52', 32, 34), t.next)
189
+ assert_equal(Token.new('#$', 37, 39), t.next)
190
+ assert_equal(Token.new('address.', 40, 48), t.next)
191
+ assert_equal(Token.new('23#@$', 49, 54), t.next)
192
+ assert_equal(Token.new('ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ', 55, 86), t.next)
193
+ assert(! t.next())
194
+ assert_equal(Token.new('DBalmän@gmail.com', 0, 18), t2.next)
195
+ assert_equal(Token.new('is', 19, 21), t2.next)
196
+ assert_equal(Token.new('My', 22, 24), t2.next)
197
+ assert_equal(Token.new('e-mail', 25, 31), t2.next)
198
+ assert_equal(Token.new('52', 32, 34), t2.next)
199
+ assert_equal(Token.new('#$', 37, 39), t2.next)
200
+ assert_equal(Token.new('address.', 40, 48), t2.next)
201
+ assert_equal(Token.new('23#@$', 49, 54), t2.next)
202
+ assert_equal(Token.new('ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ', 55, 86), t2.next)
203
+ assert(! t2.next())
204
+ a = WhiteSpaceAnalyzer.new(true)
205
+ t = a.token_stream("fieldname", input)
206
+ assert_equal(Token.new('dbalmän@gmail.com', 0, 18), t.next)
207
+ assert_equal(Token.new('is', 19, 21), t.next)
208
+ assert_equal(Token.new('my', 22, 24), t.next)
209
+ assert_equal(Token.new('e-mail', 25, 31), t.next)
210
+ assert_equal(Token.new('52', 32, 34), t.next)
211
+ assert_equal(Token.new('#$', 37, 39), t.next)
212
+ assert_equal(Token.new('address.', 40, 48), t.next)
213
+ assert_equal(Token.new('23#@$', 49, 54), t.next)
214
+ assert_equal(Token.new('áägç®êëì¯úøã¬öîí', 55, 86), t.next)
215
+ assert(! t.next())
216
+ end
217
+ end if (/utf-8/i =~ Ferret.locale)
218
+
219
+ class AsciiStandardAnalyzerTest < Test::Unit::TestCase
220
+ include Ferret::Analysis
221
+
222
+ def test_standard_analyzer()
223
+ input = 'DBalmain@gmail.com is My e-mail 52 #$ Address. 23#@$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234'
224
+ a = AsciiStandardAnalyzer.new()
225
+ t = a.token_stream("fieldname", input)
226
+ t2 = a.token_stream("fieldname", input)
227
+ assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
228
+ assert_equal(Token.new('email', 25, 31, 3), t.next)
229
+ assert_equal(Token.new('e', 25, 26, 0), t.next)
230
+ assert_equal(Token.new('mail', 27, 31), t.next)
231
+ assert_equal(Token.new('52', 32, 34), t.next)
232
+ assert_equal(Token.new('address', 40, 47), t.next)
233
+ assert_equal(Token.new('23', 49, 51), t.next)
234
+ assert_equal(Token.new('www.google.com/results', 55, 84), t.next)
235
+ assert_equal(Token.new('tnt', 86, 91), t.next)
236
+ assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
237
+ assert(! t.next())
238
+ assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t2.next)
239
+ assert_equal(Token.new('email', 25, 31, 3), t2.next)
240
+ assert_equal(Token.new('e', 25, 26, 0), t2.next)
241
+ assert_equal(Token.new('mail', 27, 31), t2.next)
242
+ assert_equal(Token.new('52', 32, 34), t2.next)
243
+ assert_equal(Token.new('address', 40, 47), t2.next)
244
+ assert_equal(Token.new('23', 49, 51), t2.next)
245
+ assert_equal(Token.new('www.google.com/results', 55, 84), t2.next)
246
+ assert_equal(Token.new('tnt', 86, 91), t2.next)
247
+ assert_equal(Token.new('123-1235-asd-1234', 93, 110), t2.next)
248
+ assert(! t2.next())
249
+ a = AsciiStandardAnalyzer.new(ENGLISH_STOP_WORDS, false)
250
+ t = a.token_stream("fieldname", input)
251
+ t2 = a.token_stream("fieldname", input)
252
+ assert_equal(Token.new('DBalmain@gmail.com', 0, 18), t.next)
253
+ assert_equal(Token.new('My', 22, 24), t.next)
254
+ assert_equal(Token.new('email', 25, 31, 3), t.next)
255
+ assert_equal(Token.new('e', 25, 26, 0), t.next)
256
+ assert_equal(Token.new('mail', 27, 31), t.next)
257
+ assert_equal(Token.new('52', 32, 34), t.next)
258
+ assert_equal(Token.new('Address', 40, 47), t.next)
259
+ assert_equal(Token.new('23', 49, 51), t.next)
260
+ assert_equal(Token.new('www.google.com/results', 55, 84), t.next)
261
+ assert_equal(Token.new('TNT', 86, 91), t.next)
262
+ assert_equal(Token.new('123-1235-ASD-1234', 93, 110), t.next)
263
+ assert(! t.next())
264
+ end
265
+ end
266
+
267
+ class StandardAnalyzerTest < Test::Unit::TestCase
268
+ include Ferret::Analysis
269
+
270
+ def test_standard_analyzer()
271
+ input = 'DBalmán@gmail.com is My e-mail and the Address. 23#@$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234 23#@$ ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ'
272
+ a = StandardAnalyzer.new()
273
+ t = a.token_stream("fieldname", input)
274
+ t2 = a.token_stream("fieldname", input)
275
+ assert_equal(Token.new('dbalmán@gmail.com', 0, 18), t.next)
276
+ assert_equal(Token.new('email', 25, 31, 3), t.next)
277
+ assert_equal(Token.new('e', 25, 26, 0), t.next)
278
+ assert_equal(Token.new('mail', 27, 31), t.next)
279
+ assert_equal(Token.new('address', 40, 47), t.next)
280
+ assert_equal(Token.new('23', 49, 51), t.next)
281
+ assert_equal(Token.new('www.google.com/results', 55, 84), t.next)
282
+ assert_equal(Token.new('tnt', 86, 91), t.next)
283
+ assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
284
+ assert_equal(Token.new('23', 111, 113), t.next)
285
+ assert_equal(Token.new('áägç', 117, 124), t.next)
286
+ assert_equal(Token.new('êëì', 126, 132), t.next)
287
+ assert_equal(Token.new('úøã', 134, 140), t.next)
288
+ assert_equal(Token.new('öîí', 142, 148), t.next)
289
+ assert(! t.next())
290
+ assert_equal(Token.new('dbalmán@gmail.com', 0, 18), t2.next)
291
+ assert_equal(Token.new('email', 25, 31, 3), t2.next)
292
+ assert_equal(Token.new('e', 25, 26, 0), t2.next)
293
+ assert_equal(Token.new('mail', 27, 31), t2.next)
294
+ assert_equal(Token.new('address', 40, 47), t2.next)
295
+ assert_equal(Token.new('23', 49, 51), t2.next)
296
+ assert_equal(Token.new('www.google.com/results', 55, 84), t2.next)
297
+ assert_equal(Token.new('tnt', 86, 91), t2.next)
298
+ assert_equal(Token.new('123-1235-asd-1234', 93, 110), t2.next)
299
+ assert_equal(Token.new('23', 111, 113), t2.next)
300
+ assert_equal(Token.new('áägç', 117, 124), t2.next)
301
+ assert_equal(Token.new('êëì', 126, 132), t2.next)
302
+ assert_equal(Token.new('úøã', 134, 140), t2.next)
303
+ assert_equal(Token.new('öîí', 142, 148), t2.next)
304
+ assert(! t2.next())
305
+ a = StandardAnalyzer.new(nil, false)
306
+ t = a.token_stream("fieldname", input)
307
+ assert_equal(Token.new('DBalmán@gmail.com', 0, 18), t.next)
308
+ assert_equal(Token.new('My', 22, 24), t.next)
309
+ assert_equal(Token.new('email', 25, 31, 3), t.next)
310
+ assert_equal(Token.new('e', 25, 26, 0), t.next)
311
+ assert_equal(Token.new('mail', 27, 31), t.next)
312
+ assert_equal(Token.new('Address', 40, 47), t.next)
313
+ assert_equal(Token.new('23', 49, 51), t.next)
314
+ assert_equal(Token.new('www.google.com/results', 55, 84), t.next)
315
+ assert_equal(Token.new('TNT', 86, 91), t.next)
316
+ assert_equal(Token.new('123-1235-ASD-1234', 93, 110), t.next)
317
+ assert_equal(Token.new('23', 111, 113), t.next)
318
+ assert_equal(Token.new('ÁÄGÇ', 117, 124), t.next)
319
+ assert_equal(Token.new('ÊËÌ', 126, 132), t.next)
320
+ assert_equal(Token.new('ÚØÃ', 134, 140), t.next)
321
+ assert_equal(Token.new('ÖÎÍ', 142, 148), t.next)
322
+ assert(! t.next())
323
+ a = StandardAnalyzer.new(["e-mail", "23", "tnt"])
324
+ t = a.token_stream("fieldname", input)
325
+ t2 = a.token_stream("fieldname", input)
326
+ assert_equal(Token.new('dbalmán@gmail.com', 0, 18), t.next)
327
+ assert_equal(Token.new('is', 19, 21), t.next)
328
+ assert_equal(Token.new('my', 22, 24), t.next)
329
+ assert_equal(Token.new('and', 32, 35), t.next)
330
+ assert_equal(Token.new('the', 36, 39), t.next)
331
+ assert_equal(Token.new('address', 40, 47), t.next)
332
+ assert_equal(Token.new('www.google.com/results', 55, 84), t.next)
333
+ assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
334
+ assert_equal(Token.new('áägç', 117, 124), t.next)
335
+ assert_equal(Token.new('êëì', 126, 132), t.next)
336
+ assert_equal(Token.new('úøã', 134, 140), t.next)
337
+ assert_equal(Token.new('öîí', 142, 148), t.next)
338
+ assert(! t.next())
339
+ assert_equal(Token.new('dbalmán@gmail.com', 0, 18), t2.next)
340
+ assert_equal(Token.new('is', 19, 21), t2.next)
341
+ assert_equal(Token.new('my', 22, 24), t2.next)
342
+ assert_equal(Token.new('and', 32, 35), t2.next)
343
+ assert_equal(Token.new('the', 36, 39), t2.next)
344
+ assert_equal(Token.new('address', 40, 47), t2.next)
345
+ assert_equal(Token.new('www.google.com/results', 55, 84), t2.next)
346
+ assert_equal(Token.new('123-1235-asd-1234', 93, 110), t2.next)
347
+ assert_equal(Token.new('áägç', 117, 124), t2.next)
348
+ assert_equal(Token.new('êëì', 126, 132), t2.next)
349
+ assert_equal(Token.new('úøã', 134, 140), t2.next)
350
+ assert_equal(Token.new('öîí', 142, 148), t2.next)
351
+ assert(! t2.next())
352
+ end
353
+ end if (/utf-8/i =~ Ferret.locale)
354
+
355
+ class PerFieldAnalyzerTest < Test::Unit::TestCase
356
+ include Ferret::Analysis
357
+ def test_per_field_analyzer()
358
+ input = 'DBalmain@gmail.com is My e-mail 52 #$ address. 23#@$'
359
+ pfa = PerFieldAnalyzer.new(StandardAnalyzer.new())
360
+ pfa['white'] = WhiteSpaceAnalyzer.new(false)
361
+ pfa['white_l'] = WhiteSpaceAnalyzer.new(true)
362
+ pfa['letter'] = LetterAnalyzer.new(false)
363
+ pfa.add_field('letter', LetterAnalyzer.new(true))
364
+ pfa.add_field('letter_u', LetterAnalyzer.new(false))
365
+ t = pfa.token_stream('white', input)
366
+ assert_equal(Token.new('DBalmain@gmail.com', 0, 18), t.next)
367
+ assert_equal(Token.new('is', 19, 21), t.next)
368
+ assert_equal(Token.new('My', 22, 24), t.next)
369
+ assert_equal(Token.new('e-mail', 25, 31), t.next)
370
+ assert_equal(Token.new('52', 32, 34), t.next)
371
+ assert_equal(Token.new('#$', 37, 39), t.next)
372
+ assert_equal(Token.new('address.', 40, 48), t.next)
373
+ assert_equal(Token.new('23#@$', 49, 54), t.next)
374
+ assert(! t.next())
375
+ t = pfa.token_stream('white_l', input)
376
+ assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
377
+ assert_equal(Token.new('is', 19, 21), t.next)
378
+ assert_equal(Token.new('my', 22, 24), t.next)
379
+ assert_equal(Token.new('e-mail', 25, 31), t.next)
380
+ assert_equal(Token.new('52', 32, 34), t.next)
381
+ assert_equal(Token.new('#$', 37, 39), t.next)
382
+ assert_equal(Token.new('address.', 40, 48), t.next)
383
+ assert_equal(Token.new('23#@$', 49, 54), t.next)
384
+ assert(! t.next())
385
+ t = pfa.token_stream('letter_u', input)
386
+ assert_equal(Token.new('DBalmain', 0, 8), t.next)
387
+ assert_equal(Token.new('gmail', 9, 14), t.next)
388
+ assert_equal(Token.new('com', 15, 18), t.next)
389
+ assert_equal(Token.new('is', 19, 21), t.next)
390
+ assert_equal(Token.new('My', 22, 24), t.next)
391
+ assert_equal(Token.new('e', 25, 26), t.next)
392
+ assert_equal(Token.new('mail', 27, 31), t.next)
393
+ assert_equal(Token.new('address', 40, 47), t.next)
394
+ assert(! t.next())
395
+ t = pfa.token_stream('letter', input)
396
+ assert_equal(Token.new('dbalmain', 0, 8), t.next)
397
+ assert_equal(Token.new('gmail', 9, 14), t.next)
398
+ assert_equal(Token.new('com', 15, 18), t.next)
399
+ assert_equal(Token.new('is', 19, 21), t.next)
400
+ assert_equal(Token.new('my', 22, 24), t.next)
401
+ assert_equal(Token.new('e', 25, 26), t.next)
402
+ assert_equal(Token.new('mail', 27, 31), t.next)
403
+ assert_equal(Token.new('address', 40, 47), t.next)
404
+ assert(! t.next())
405
+ t = pfa.token_stream('XXX', input) # should use default StandardAnalyzer
406
+ assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
407
+ assert_equal(Token.new('email', 25, 31, 3), t.next)
408
+ assert_equal(Token.new('e', 25, 26, 0), t.next)
409
+ assert_equal(Token.new('mail', 27, 31), t.next)
410
+ assert_equal(Token.new('52', 32, 34), t.next)
411
+ assert_equal(Token.new('address', 40, 47), t.next)
412
+ assert_equal(Token.new('23', 49, 51), t.next)
413
+ assert(! t.next())
414
+ end
415
+ end
416
+
417
+ class RegExpAnalyzerTest < Test::Unit::TestCase
418
+ include Ferret::Analysis
419
+
420
+ def test_reg_exp_analyzer()
421
+ input = 'DBalmain@gmail.com is My e-mail 52 #$ Address. 23#@$ http://www.google.com/RESULT_3.html T.N.T. 123-1235-ASD-1234 23 Rob\'s'
422
+ a = RegExpAnalyzer.new()
423
+ t = a.token_stream('XXX', input)
424
+ t2 = a.token_stream('XXX', "one_Two three")
425
+ assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
426
+ assert_equal(Token.new('is', 19, 21), t.next)
427
+ assert_equal(Token.new('my', 22, 24), t.next)
428
+ assert_equal(Token.new('e-mail', 25, 31), t.next)
429
+ assert_equal(Token.new('52', 32, 34), t.next)
430
+ assert_equal(Token.new('address', 40, 47), t.next)
431
+ assert_equal(Token.new('23', 49, 51), t.next)
432
+ assert_equal(Token.new('http://www.google.com/result_3.html', 55, 90), t.next)
433
+ assert_equal(Token.new('t.n.t.', 91, 97), t.next)
434
+ assert_equal(Token.new('123-1235-asd-1234', 98, 115), t.next)
435
+ assert_equal(Token.new('23', 116, 118), t.next)
436
+ assert_equal(Token.new('rob\'s', 119, 124), t.next)
437
+ assert(! t.next())
438
+ t = t2
439
+ assert_equal(Token.new("one_two", 0, 7), t.next())
440
+ assert_equal(Token.new("three", 8, 13), t.next())
441
+ assert(! t.next())
442
+ a = RegExpAnalyzer.new(/\w{2,}/, false)
443
+ t = a.token_stream('XXX', input)
444
+ t2 = a.token_stream('XXX', "one Two three")
445
+ assert_equal(Token.new('DBalmain', 0, 8), t.next)
446
+ assert_equal(Token.new('gmail', 9, 14), t.next)
447
+ assert_equal(Token.new('com', 15, 18), t.next)
448
+ assert_equal(Token.new('is', 19, 21), t.next)
449
+ assert_equal(Token.new('My', 22, 24), t.next)
450
+ assert_equal(Token.new('mail', 27, 31), t.next)
451
+ assert_equal(Token.new('52', 32, 34), t.next)
452
+ assert_equal(Token.new('Address', 40, 47), t.next)
453
+ assert_equal(Token.new('23', 49, 51), t.next)
454
+ assert_equal(Token.new('http', 55, 59), t.next)
455
+ assert_equal(Token.new('www', 62, 65), t.next)
456
+ assert_equal(Token.new('google', 66, 72), t.next)
457
+ assert_equal(Token.new('com', 73, 76), t.next)
458
+ assert_equal(Token.new('RESULT_3', 77, 85), t.next)
459
+ assert_equal(Token.new('html', 86, 90), t.next)
460
+ assert_equal(Token.new('123', 98, 101), t.next)
461
+ assert_equal(Token.new('1235', 102, 106), t.next)
462
+ assert_equal(Token.new('ASD', 107, 110), t.next)
463
+ assert_equal(Token.new('1234', 111, 115), t.next)
464
+ assert_equal(Token.new('23', 116, 118), t.next)
465
+ assert_equal(Token.new('Rob', 119, 122), t.next)
466
+ assert(! t.next())
467
+ assert_equal(Token.new("one", 0, 3), t2.next())
468
+ assert_equal(Token.new("Two", 4, 7), t2.next())
469
+ assert_equal(Token.new("three", 8, 13), t2.next())
470
+ assert(! t2.next())
471
+ a = RegExpAnalyzer.new() do |str|
472
+ if str =~ /^[[:alpha:]]\.([[:alpha:]]\.)+$/
473
+ str.gsub!(/\./, '')
474
+ elsif str =~ /'[sS]$/
475
+ str.gsub!(/'[sS]$/, '')
476
+ end
477
+ str
478
+ end
479
+ t = a.token_stream('XXX', input)
480
+ t2 = a.token_stream('XXX', "one's don't T.N.T.")
481
+ assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
482
+ assert_equal(Token.new('is', 19, 21), t.next)
483
+ assert_equal(Token.new('my', 22, 24), t.next)
484
+ assert_equal(Token.new('e-mail', 25, 31), t.next)
485
+ assert_equal(Token.new('52', 32, 34), t.next)
486
+ assert_equal(Token.new('address', 40, 47), t.next)
487
+ assert_equal(Token.new('23', 49, 51), t.next)
488
+ assert_equal(Token.new('http://www.google.com/result_3.html', 55, 90), t.next)
489
+ assert_equal(Token.new('tnt', 91, 97), t.next)
490
+ assert_equal(Token.new('123-1235-asd-1234', 98, 115), t.next)
491
+ assert_equal(Token.new('23', 116, 118), t.next)
492
+ assert_equal(Token.new('rob', 119, 124), t.next)
493
+ assert(! t.next())
494
+ assert_equal(Token.new("one", 0, 5), t2.next())
495
+ assert_equal(Token.new("don't", 6, 11), t2.next())
496
+ assert_equal(Token.new("tnt", 12, 18), t2.next())
497
+ assert(! t2.next())
498
+ end
499
+ end
500
+
501
+ module Ferret::Analysis
502
+ class StemmingStandardAnalyzer < StandardAnalyzer
503
+ def token_stream(field, text)
504
+ StemFilter.new(super)
505
+ end
506
+ end
507
+ end
508
+
509
+ class CustomAnalyzerTest < Test::Unit::TestCase
510
+ include Ferret::Analysis
511
+
512
+ def test_custom_filter()
513
+ input = 'DBalmán@gmail.com is My e-mail and the Address. 23#@$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234 23#@$ ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ'
514
+ a = StemmingStandardAnalyzer.new()
515
+ t = a.token_stream("fieldname", input)
516
+ assert_equal(Token.new('dbalmán@gmail.com', 0, 18), t.next)
517
+ assert_equal(Token.new('email', 25, 31, 3), t.next)
518
+ assert_equal(Token.new('e', 25, 26, 0), t.next)
519
+ assert_equal(Token.new('mail', 27, 31), t.next)
520
+ assert_equal(Token.new('address', 40, 47), t.next)
521
+ assert_equal(Token.new('23', 49, 51), t.next)
522
+ assert_equal(Token.new('www.google.com/result', 55, 84), t.next)
523
+ assert_equal(Token.new('tnt', 86, 91), t.next)
524
+ assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
525
+ assert_equal(Token.new('23', 111, 113), t.next)
526
+ assert_equal(Token.new('áägç', 117, 124), t.next)
527
+ assert_equal(Token.new('êëì', 126, 132), t.next)
528
+ assert_equal(Token.new('úøã', 134, 140), t.next)
529
+ assert_equal(Token.new('öîí', 142, 148), t.next)
530
+ assert(! t.next())
531
+ input = "Debate Debates DEBATED DEBating Debater";
532
+ t = a.token_stream("fieldname", input)
533
+ assert_equal(Token.new("debat", 0, 6), t.next)
534
+ assert_equal(Token.new("debat", 7, 14), t.next)
535
+ assert_equal(Token.new("debat", 15, 22), t.next)
536
+ assert_equal(Token.new("debat", 23, 31), t.next)
537
+ assert_equal(Token.new("debat", 32, 39), t.next)
538
+ assert(! t.next())
539
+ input = "Dêbate dêbates DÊBATED DÊBATing dêbater";
540
+ t = StemFilter.new(LowerCaseFilter.new(LetterTokenizer.new(input)), :english)
541
+ assert_equal(Token.new("dêbate", 0, 7), t.next)
542
+ assert_equal(Token.new("dêbate", 8, 16), t.next)
543
+ assert_equal(Token.new("dêbate", 17, 25), t.next)
544
+ assert_equal(Token.new("dêbate", 26, 35), t.next)
545
+ assert_equal(Token.new("dêbater", 36, 44), t.next)
546
+ assert(! t.next())
547
+ end
548
+ end if (/utf-8/i =~ Ferret.locale)