jk-ferret 0.11.8.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (228) hide show
  1. data/CHANGELOG +24 -0
  2. data/MIT-LICENSE +20 -0
  3. data/README +90 -0
  4. data/RELEASE_CHANGES +137 -0
  5. data/RELEASE_NOTES +60 -0
  6. data/Rakefile +443 -0
  7. data/TODO +109 -0
  8. data/TUTORIAL +231 -0
  9. data/bin/ferret-browser +79 -0
  10. data/ext/BZLIB_blocksort.c +1094 -0
  11. data/ext/BZLIB_bzlib.c +1578 -0
  12. data/ext/BZLIB_compress.c +672 -0
  13. data/ext/BZLIB_crctable.c +104 -0
  14. data/ext/BZLIB_decompress.c +626 -0
  15. data/ext/BZLIB_huffman.c +205 -0
  16. data/ext/BZLIB_randtable.c +84 -0
  17. data/ext/STEMMER_api.c +66 -0
  18. data/ext/STEMMER_libstemmer.c +93 -0
  19. data/ext/STEMMER_stem_ISO_8859_1_danish.c +337 -0
  20. data/ext/STEMMER_stem_ISO_8859_1_dutch.c +624 -0
  21. data/ext/STEMMER_stem_ISO_8859_1_english.c +1117 -0
  22. data/ext/STEMMER_stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/STEMMER_stem_ISO_8859_1_french.c +1246 -0
  24. data/ext/STEMMER_stem_ISO_8859_1_german.c +503 -0
  25. data/ext/STEMMER_stem_ISO_8859_1_hungarian.c +1230 -0
  26. data/ext/STEMMER_stem_ISO_8859_1_italian.c +1065 -0
  27. data/ext/STEMMER_stem_ISO_8859_1_norwegian.c +297 -0
  28. data/ext/STEMMER_stem_ISO_8859_1_porter.c +749 -0
  29. data/ext/STEMMER_stem_ISO_8859_1_portuguese.c +1017 -0
  30. data/ext/STEMMER_stem_ISO_8859_1_spanish.c +1093 -0
  31. data/ext/STEMMER_stem_ISO_8859_1_swedish.c +307 -0
  32. data/ext/STEMMER_stem_ISO_8859_2_romanian.c +998 -0
  33. data/ext/STEMMER_stem_KOI8_R_russian.c +700 -0
  34. data/ext/STEMMER_stem_UTF_8_danish.c +339 -0
  35. data/ext/STEMMER_stem_UTF_8_dutch.c +634 -0
  36. data/ext/STEMMER_stem_UTF_8_english.c +1125 -0
  37. data/ext/STEMMER_stem_UTF_8_finnish.c +768 -0
  38. data/ext/STEMMER_stem_UTF_8_french.c +1256 -0
  39. data/ext/STEMMER_stem_UTF_8_german.c +509 -0
  40. data/ext/STEMMER_stem_UTF_8_hungarian.c +1234 -0
  41. data/ext/STEMMER_stem_UTF_8_italian.c +1073 -0
  42. data/ext/STEMMER_stem_UTF_8_norwegian.c +299 -0
  43. data/ext/STEMMER_stem_UTF_8_porter.c +755 -0
  44. data/ext/STEMMER_stem_UTF_8_portuguese.c +1023 -0
  45. data/ext/STEMMER_stem_UTF_8_romanian.c +1004 -0
  46. data/ext/STEMMER_stem_UTF_8_russian.c +694 -0
  47. data/ext/STEMMER_stem_UTF_8_spanish.c +1097 -0
  48. data/ext/STEMMER_stem_UTF_8_swedish.c +309 -0
  49. data/ext/STEMMER_stem_UTF_8_turkish.c +2205 -0
  50. data/ext/STEMMER_utilities.c +478 -0
  51. data/ext/analysis.c +1710 -0
  52. data/ext/analysis.h +266 -0
  53. data/ext/api.h +26 -0
  54. data/ext/array.c +125 -0
  55. data/ext/array.h +62 -0
  56. data/ext/bitvector.c +96 -0
  57. data/ext/bitvector.h +594 -0
  58. data/ext/bzlib.h +282 -0
  59. data/ext/bzlib_private.h +503 -0
  60. data/ext/compound_io.c +384 -0
  61. data/ext/config.h +52 -0
  62. data/ext/document.c +159 -0
  63. data/ext/document.h +63 -0
  64. data/ext/except.c +102 -0
  65. data/ext/except.h +176 -0
  66. data/ext/extconf.rb +15 -0
  67. data/ext/ferret.c +416 -0
  68. data/ext/ferret.h +94 -0
  69. data/ext/field_index.c +262 -0
  70. data/ext/field_index.h +52 -0
  71. data/ext/filter.c +157 -0
  72. data/ext/fs_store.c +493 -0
  73. data/ext/global.c +458 -0
  74. data/ext/global.h +302 -0
  75. data/ext/hash.c +524 -0
  76. data/ext/hash.h +515 -0
  77. data/ext/hashset.c +192 -0
  78. data/ext/hashset.h +215 -0
  79. data/ext/header.h +58 -0
  80. data/ext/helper.c +63 -0
  81. data/ext/helper.h +21 -0
  82. data/ext/index.c +6804 -0
  83. data/ext/index.h +935 -0
  84. data/ext/internal.h +1019 -0
  85. data/ext/lang.c +10 -0
  86. data/ext/lang.h +68 -0
  87. data/ext/libstemmer.h +79 -0
  88. data/ext/mempool.c +88 -0
  89. data/ext/mempool.h +43 -0
  90. data/ext/modules.h +190 -0
  91. data/ext/multimapper.c +351 -0
  92. data/ext/multimapper.h +60 -0
  93. data/ext/posh.c +1006 -0
  94. data/ext/posh.h +973 -0
  95. data/ext/priorityqueue.c +149 -0
  96. data/ext/priorityqueue.h +155 -0
  97. data/ext/q_boolean.c +1621 -0
  98. data/ext/q_const_score.c +162 -0
  99. data/ext/q_filtered_query.c +212 -0
  100. data/ext/q_fuzzy.c +280 -0
  101. data/ext/q_match_all.c +149 -0
  102. data/ext/q_multi_term.c +673 -0
  103. data/ext/q_parser.c +3103 -0
  104. data/ext/q_phrase.c +1206 -0
  105. data/ext/q_prefix.c +98 -0
  106. data/ext/q_range.c +682 -0
  107. data/ext/q_span.c +2390 -0
  108. data/ext/q_term.c +337 -0
  109. data/ext/q_wildcard.c +167 -0
  110. data/ext/r_analysis.c +2626 -0
  111. data/ext/r_index.c +3468 -0
  112. data/ext/r_qparser.c +635 -0
  113. data/ext/r_search.c +4490 -0
  114. data/ext/r_store.c +513 -0
  115. data/ext/r_utils.c +1131 -0
  116. data/ext/ram_store.c +476 -0
  117. data/ext/scanner.c +895 -0
  118. data/ext/scanner.h +36 -0
  119. data/ext/scanner_mb.c +6701 -0
  120. data/ext/scanner_utf8.c +4415 -0
  121. data/ext/search.c +1864 -0
  122. data/ext/search.h +953 -0
  123. data/ext/similarity.c +151 -0
  124. data/ext/similarity.h +89 -0
  125. data/ext/sort.c +786 -0
  126. data/ext/stem_ISO_8859_1_danish.h +16 -0
  127. data/ext/stem_ISO_8859_1_dutch.h +16 -0
  128. data/ext/stem_ISO_8859_1_english.h +16 -0
  129. data/ext/stem_ISO_8859_1_finnish.h +16 -0
  130. data/ext/stem_ISO_8859_1_french.h +16 -0
  131. data/ext/stem_ISO_8859_1_german.h +16 -0
  132. data/ext/stem_ISO_8859_1_hungarian.h +16 -0
  133. data/ext/stem_ISO_8859_1_italian.h +16 -0
  134. data/ext/stem_ISO_8859_1_norwegian.h +16 -0
  135. data/ext/stem_ISO_8859_1_porter.h +16 -0
  136. data/ext/stem_ISO_8859_1_portuguese.h +16 -0
  137. data/ext/stem_ISO_8859_1_spanish.h +16 -0
  138. data/ext/stem_ISO_8859_1_swedish.h +16 -0
  139. data/ext/stem_ISO_8859_2_romanian.h +16 -0
  140. data/ext/stem_KOI8_R_russian.h +16 -0
  141. data/ext/stem_UTF_8_danish.h +16 -0
  142. data/ext/stem_UTF_8_dutch.h +16 -0
  143. data/ext/stem_UTF_8_english.h +16 -0
  144. data/ext/stem_UTF_8_finnish.h +16 -0
  145. data/ext/stem_UTF_8_french.h +16 -0
  146. data/ext/stem_UTF_8_german.h +16 -0
  147. data/ext/stem_UTF_8_hungarian.h +16 -0
  148. data/ext/stem_UTF_8_italian.h +16 -0
  149. data/ext/stem_UTF_8_norwegian.h +16 -0
  150. data/ext/stem_UTF_8_porter.h +16 -0
  151. data/ext/stem_UTF_8_portuguese.h +16 -0
  152. data/ext/stem_UTF_8_romanian.h +16 -0
  153. data/ext/stem_UTF_8_russian.h +16 -0
  154. data/ext/stem_UTF_8_spanish.h +16 -0
  155. data/ext/stem_UTF_8_swedish.h +16 -0
  156. data/ext/stem_UTF_8_turkish.h +16 -0
  157. data/ext/stopwords.c +410 -0
  158. data/ext/store.c +698 -0
  159. data/ext/store.h +799 -0
  160. data/ext/symbol.c +10 -0
  161. data/ext/symbol.h +23 -0
  162. data/ext/term_vectors.c +73 -0
  163. data/ext/threading.h +31 -0
  164. data/ext/win32.h +62 -0
  165. data/lib/ferret.rb +30 -0
  166. data/lib/ferret/browser.rb +246 -0
  167. data/lib/ferret/browser/s/global.js +192 -0
  168. data/lib/ferret/browser/s/style.css +148 -0
  169. data/lib/ferret/browser/views/document/list.rhtml +49 -0
  170. data/lib/ferret/browser/views/document/show.rhtml +27 -0
  171. data/lib/ferret/browser/views/error/index.rhtml +7 -0
  172. data/lib/ferret/browser/views/help/index.rhtml +8 -0
  173. data/lib/ferret/browser/views/home/index.rhtml +29 -0
  174. data/lib/ferret/browser/views/layout.rhtml +22 -0
  175. data/lib/ferret/browser/views/term-vector/index.rhtml +4 -0
  176. data/lib/ferret/browser/views/term/index.rhtml +199 -0
  177. data/lib/ferret/browser/views/term/termdocs.rhtml +1 -0
  178. data/lib/ferret/browser/webrick.rb +14 -0
  179. data/lib/ferret/document.rb +130 -0
  180. data/lib/ferret/field_infos.rb +44 -0
  181. data/lib/ferret/field_symbol.rb +87 -0
  182. data/lib/ferret/index.rb +973 -0
  183. data/lib/ferret/number_tools.rb +157 -0
  184. data/lib/ferret/version.rb +3 -0
  185. data/setup.rb +1555 -0
  186. data/test/long_running/largefile/tc_largefile.rb +46 -0
  187. data/test/test_all.rb +5 -0
  188. data/test/test_helper.rb +29 -0
  189. data/test/test_installed.rb +1 -0
  190. data/test/threading/number_to_spoken.rb +132 -0
  191. data/test/threading/thread_safety_index_test.rb +88 -0
  192. data/test/threading/thread_safety_read_write_test.rb +73 -0
  193. data/test/threading/thread_safety_test.rb +133 -0
  194. data/test/unit/analysis/tc_analyzer.rb +550 -0
  195. data/test/unit/analysis/tc_token_stream.rb +653 -0
  196. data/test/unit/index/tc_index.rb +867 -0
  197. data/test/unit/index/tc_index_reader.rb +699 -0
  198. data/test/unit/index/tc_index_writer.rb +447 -0
  199. data/test/unit/index/th_doc.rb +332 -0
  200. data/test/unit/query_parser/tc_query_parser.rb +238 -0
  201. data/test/unit/search/tc_filter.rb +156 -0
  202. data/test/unit/search/tc_fuzzy_query.rb +147 -0
  203. data/test/unit/search/tc_index_searcher.rb +67 -0
  204. data/test/unit/search/tc_multi_searcher.rb +128 -0
  205. data/test/unit/search/tc_multiple_search_requests.rb +58 -0
  206. data/test/unit/search/tc_search_and_sort.rb +179 -0
  207. data/test/unit/search/tc_sort.rb +49 -0
  208. data/test/unit/search/tc_sort_field.rb +27 -0
  209. data/test/unit/search/tc_spans.rb +190 -0
  210. data/test/unit/search/tm_searcher.rb +436 -0
  211. data/test/unit/store/tc_fs_store.rb +115 -0
  212. data/test/unit/store/tc_ram_store.rb +35 -0
  213. data/test/unit/store/tm_store.rb +34 -0
  214. data/test/unit/store/tm_store_lock.rb +68 -0
  215. data/test/unit/tc_document.rb +81 -0
  216. data/test/unit/tc_field_symbol.rb +26 -0
  217. data/test/unit/ts_analysis.rb +2 -0
  218. data/test/unit/ts_index.rb +2 -0
  219. data/test/unit/ts_largefile.rb +4 -0
  220. data/test/unit/ts_query_parser.rb +2 -0
  221. data/test/unit/ts_search.rb +2 -0
  222. data/test/unit/ts_store.rb +2 -0
  223. data/test/unit/ts_utils.rb +2 -0
  224. data/test/unit/utils/tc_bit_vector.rb +295 -0
  225. data/test/unit/utils/tc_number_tools.rb +117 -0
  226. data/test/unit/utils/tc_priority_queue.rb +106 -0
  227. data/test/utils/content_generator.rb +226 -0
  228. metadata +319 -0
@@ -0,0 +1,550 @@
1
+ # encoding: utf-8
2
+
3
+ require File.dirname(__FILE__) + "/../../test_helper"
4
+
5
+ class AnalyzerTest < Test::Unit::TestCase
6
+ include Ferret::Analysis
7
+
8
+ def test_analyzer()
9
+ input = 'DBalmain@gmail.com is My E-Mail 523@#$ ADDRESS. 23#!$'
10
+ a = Analyzer.new()
11
+ t = a.token_stream("fieldname", input)
12
+ t2 = a.token_stream("fieldname", input)
13
+ assert_equal(Token.new("dbalmain", 0, 8), t.next())
14
+ assert_equal(Token.new("gmail", 9, 14), t.next())
15
+ assert_equal(Token.new("com", 15, 18), t.next())
16
+ assert_equal(Token.new("is", 19, 21), t.next())
17
+ assert_equal(Token.new("my", 22, 24), t.next())
18
+ assert_equal(Token.new("e", 25, 26), t.next())
19
+ assert_equal(Token.new("mail", 27, 31), t.next())
20
+ assert_equal(Token.new("address", 39, 46), t.next())
21
+ assert(! t.next())
22
+ assert_equal(Token.new("dbalmain", 0, 8), t2.next())
23
+ assert_equal(Token.new("gmail", 9, 14), t2.next())
24
+ assert_equal(Token.new("com", 15, 18), t2.next())
25
+ assert_equal(Token.new("is", 19, 21), t2.next())
26
+ assert_equal(Token.new("my", 22, 24), t2.next())
27
+ assert_equal(Token.new("e", 25, 26), t2.next())
28
+ assert_equal(Token.new("mail", 27, 31), t2.next())
29
+ assert_equal(Token.new("address", 39, 46), t2.next())
30
+ assert(! t2.next())
31
+ a = Analyzer.new(false)
32
+ t = a.token_stream("fieldname", input)
33
+ assert_equal(Token.new("DBalmain", 0, 8), t.next())
34
+ assert_equal(Token.new("gmail", 9, 14), t.next())
35
+ assert_equal(Token.new("com", 15, 18), t.next())
36
+ assert_equal(Token.new("is", 19, 21), t.next())
37
+ assert_equal(Token.new("My", 22, 24), t.next())
38
+ assert_equal(Token.new("E", 25, 26), t.next())
39
+ assert_equal(Token.new("Mail", 27, 31), t.next())
40
+ assert_equal(Token.new("ADDRESS", 39, 46), t.next())
41
+ assert(! t.next())
42
+ end
43
+ end if (/utf-8/i =~ Ferret.locale)
44
+
45
+ class AsciiLetterAnalyzerTest < Test::Unit::TestCase
46
+ include Ferret::Analysis
47
+
48
+ def test_letter_analyzer()
49
+ input = 'DBalmain@gmail.com is My E-Mail 523@#$ ADDRESS. 23#!$'
50
+ a = AsciiLetterAnalyzer.new()
51
+ t = a.token_stream("fieldname", input)
52
+ t2 = a.token_stream("fieldname", input)
53
+ assert_equal(Token.new("dbalmain", 0, 8), t.next())
54
+ assert_equal(Token.new("gmail", 9, 14), t.next())
55
+ assert_equal(Token.new("com", 15, 18), t.next())
56
+ assert_equal(Token.new("is", 19, 21), t.next())
57
+ assert_equal(Token.new("my", 22, 24), t.next())
58
+ assert_equal(Token.new("e", 25, 26), t.next())
59
+ assert_equal(Token.new("mail", 27, 31), t.next())
60
+ assert_equal(Token.new("address", 39, 46), t.next())
61
+ assert(! t.next())
62
+ assert_equal(Token.new("dbalmain", 0, 8), t2.next())
63
+ assert_equal(Token.new("gmail", 9, 14), t2.next())
64
+ assert_equal(Token.new("com", 15, 18), t2.next())
65
+ assert_equal(Token.new("is", 19, 21), t2.next())
66
+ assert_equal(Token.new("my", 22, 24), t2.next())
67
+ assert_equal(Token.new("e", 25, 26), t2.next())
68
+ assert_equal(Token.new("mail", 27, 31), t2.next())
69
+ assert_equal(Token.new("address", 39, 46), t2.next())
70
+ assert(! t2.next())
71
+ a = AsciiLetterAnalyzer.new(false)
72
+ t = a.token_stream("fieldname", input)
73
+ assert_equal(Token.new("DBalmain", 0, 8), t.next())
74
+ assert_equal(Token.new("gmail", 9, 14), t.next())
75
+ assert_equal(Token.new("com", 15, 18), t.next())
76
+ assert_equal(Token.new("is", 19, 21), t.next())
77
+ assert_equal(Token.new("My", 22, 24), t.next())
78
+ assert_equal(Token.new("E", 25, 26), t.next())
79
+ assert_equal(Token.new("Mail", 27, 31), t.next())
80
+ assert_equal(Token.new("ADDRESS", 39, 46), t.next())
81
+ assert(! t.next())
82
+ end
83
+ end
84
+
85
+ class LetterAnalyzerTest < Test::Unit::TestCase
86
+ include Ferret::Analysis
87
+
88
+ def test_letter_analyzer()
89
+ Ferret.locale = ""
90
+ input = 'DBalmän@gmail.com is My e-mail 52   #$ address. 23#!$ ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ'
91
+ a = LetterAnalyzer.new(false)
92
+ t = a.token_stream("fieldname", input)
93
+ t2 = a.token_stream("fieldname", input)
94
+ assert_equal(Token.new("DBalmän", 0, 8), t.next)
95
+ assert_equal(Token.new("gmail", 9, 14), t.next)
96
+ assert_equal(Token.new("com", 15, 18), t.next)
97
+ assert_equal(Token.new("is", 19, 21), t.next)
98
+ assert_equal(Token.new("My", 22, 24), t.next)
99
+ assert_equal(Token.new("e", 25, 26), t.next)
100
+ assert_equal(Token.new("mail", 27, 31), t.next)
101
+ assert_equal(Token.new("address", 40, 47), t.next)
102
+ assert_equal(Token.new("ÁÄGÇ", 55, 62), t.next)
103
+ assert_equal(Token.new("ÊËÌ", 64, 70), t.next)
104
+ assert_equal(Token.new("ÚØÃ", 72, 78), t.next)
105
+ assert_equal(Token.new("ÖÎÍ", 80, 86), t.next)
106
+ assert(! t.next())
107
+ assert_equal(Token.new("DBalmän", 0, 8), t2.next)
108
+ assert_equal(Token.new("gmail", 9, 14), t2.next)
109
+ assert_equal(Token.new("com", 15, 18), t2.next)
110
+ assert_equal(Token.new("is", 19, 21), t2.next)
111
+ assert_equal(Token.new("My", 22, 24), t2.next)
112
+ assert_equal(Token.new("e", 25, 26), t2.next)
113
+ assert_equal(Token.new("mail", 27, 31), t2.next)
114
+ assert_equal(Token.new("address", 40, 47), t2.next)
115
+ assert_equal(Token.new("ÁÄGÇ", 55, 62), t2.next)
116
+ assert_equal(Token.new("ÊËÌ", 64, 70), t2.next)
117
+ assert_equal(Token.new("ÚØÃ", 72, 78), t2.next)
118
+ assert_equal(Token.new("ÖÎÍ", 80, 86), t2.next)
119
+ assert(! t2.next())
120
+ a = LetterAnalyzer.new()
121
+ t = a.token_stream("fieldname", input)
122
+ assert_equal(Token.new("dbalmän", 0, 8), t.next)
123
+ assert_equal(Token.new("gmail", 9, 14), t.next)
124
+ assert_equal(Token.new("com", 15, 18), t.next)
125
+ assert_equal(Token.new("is", 19, 21), t.next)
126
+ assert_equal(Token.new("my", 22, 24), t.next)
127
+ assert_equal(Token.new("e", 25, 26), t.next)
128
+ assert_equal(Token.new("mail", 27, 31), t.next)
129
+ assert_equal(Token.new("address", 40, 47), t.next)
130
+ assert_equal(Token.new("áägç", 55, 62), t.next)
131
+ assert_equal(Token.new("êëì", 64, 70), t.next)
132
+ assert_equal(Token.new("úøã", 72, 78), t.next)
133
+ assert_equal(Token.new("öîí", 80, 86), t.next)
134
+ assert(! t.next())
135
+ end
136
+ end if (/utf-8/i =~ Ferret.locale)
137
+
138
+ class AsciiWhiteSpaceAnalyzerTest < Test::Unit::TestCase
139
+ include Ferret::Analysis
140
+
141
+ def test_white_space_analyzer()
142
+ input = 'DBalmain@gmail.com is My E-Mail 52   #$ ADDRESS. 23#!$'
143
+ a = AsciiWhiteSpaceAnalyzer.new()
144
+ t = a.token_stream("fieldname", input)
145
+ t2 = a.token_stream("fieldname", input)
146
+ assert_equal(Token.new('DBalmain@gmail.com', 0, 18), t.next)
147
+ assert_equal(Token.new('is', 19, 21), t.next)
148
+ assert_equal(Token.new('My', 22, 24), t.next)
149
+ assert_equal(Token.new('E-Mail', 25, 31), t.next)
150
+ assert_equal(Token.new('52', 32, 34), t.next)
151
+ assert_equal(Token.new('#$', 37, 39), t.next)
152
+ assert_equal(Token.new('ADDRESS.', 40, 48), t.next)
153
+ assert_equal(Token.new('23#!$', 49, 54), t.next)
154
+ assert(! t.next())
155
+ assert_equal(Token.new('DBalmain@gmail.com', 0, 18), t2.next)
156
+ assert_equal(Token.new('is', 19, 21), t2.next)
157
+ assert_equal(Token.new('My', 22, 24), t2.next)
158
+ assert_equal(Token.new('E-Mail', 25, 31), t2.next)
159
+ assert_equal(Token.new('52', 32, 34), t2.next)
160
+ assert_equal(Token.new('#$', 37, 39), t2.next)
161
+ assert_equal(Token.new('ADDRESS.', 40, 48), t2.next)
162
+ assert_equal(Token.new('23#!$', 49, 54), t2.next)
163
+ assert(! t2.next())
164
+ a = AsciiWhiteSpaceAnalyzer.new(true)
165
+ t = a.token_stream("fieldname", input)
166
+ assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
167
+ assert_equal(Token.new('is', 19, 21), t.next)
168
+ assert_equal(Token.new('my', 22, 24), t.next)
169
+ assert_equal(Token.new('e-mail', 25, 31), t.next)
170
+ assert_equal(Token.new('52', 32, 34), t.next)
171
+ assert_equal(Token.new('#$', 37, 39), t.next)
172
+ assert_equal(Token.new('address.', 40, 48), t.next)
173
+ assert_equal(Token.new('23#!$', 49, 54), t.next)
174
+ assert(! t.next())
175
+ end
176
+ end
177
+
178
+ class WhiteSpaceAnalyzerTest < Test::Unit::TestCase
179
+ include Ferret::Analysis
180
+
181
+ def test_white_space_analyzer()
182
+ input = 'DBalmän@gmail.com is My e-mail 52   #$ address. 23#!$ ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ'
183
+ a = WhiteSpaceAnalyzer.new()
184
+ t = a.token_stream("fieldname", input)
185
+ t2 = a.token_stream("fieldname", input)
186
+ assert_equal(Token.new('DBalmän@gmail.com', 0, 18), t.next)
187
+ assert_equal(Token.new('is', 19, 21), t.next)
188
+ assert_equal(Token.new('My', 22, 24), t.next)
189
+ assert_equal(Token.new('e-mail', 25, 31), t.next)
190
+ assert_equal(Token.new('52', 32, 34), t.next)
191
+ assert_equal(Token.new('#$', 37, 39), t.next)
192
+ assert_equal(Token.new('address.', 40, 48), t.next)
193
+ assert_equal(Token.new('23#!$', 49, 54), t.next)
194
+ assert_equal(Token.new('ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ', 55, 86), t.next)
195
+ assert(! t.next())
196
+ assert_equal(Token.new('DBalmän@gmail.com', 0, 18), t2.next)
197
+ assert_equal(Token.new('is', 19, 21), t2.next)
198
+ assert_equal(Token.new('My', 22, 24), t2.next)
199
+ assert_equal(Token.new('e-mail', 25, 31), t2.next)
200
+ assert_equal(Token.new('52', 32, 34), t2.next)
201
+ assert_equal(Token.new('#$', 37, 39), t2.next)
202
+ assert_equal(Token.new('address.', 40, 48), t2.next)
203
+ assert_equal(Token.new('23#!$', 49, 54), t2.next)
204
+ assert_equal(Token.new('ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ', 55, 86), t2.next)
205
+ assert(! t2.next())
206
+ a = WhiteSpaceAnalyzer.new(true)
207
+ t = a.token_stream("fieldname", input)
208
+ assert_equal(Token.new('dbalmän@gmail.com', 0, 18), t.next)
209
+ assert_equal(Token.new('is', 19, 21), t.next)
210
+ assert_equal(Token.new('my', 22, 24), t.next)
211
+ assert_equal(Token.new('e-mail', 25, 31), t.next)
212
+ assert_equal(Token.new('52', 32, 34), t.next)
213
+ assert_equal(Token.new('#$', 37, 39), t.next)
214
+ assert_equal(Token.new('address.', 40, 48), t.next)
215
+ assert_equal(Token.new('23#!$', 49, 54), t.next)
216
+ assert_equal(Token.new('áägç®êëì¯úøã¬öîí', 55, 86), t.next)
217
+ assert(! t.next())
218
+ end
219
+ end if (/utf-8/i =~ Ferret.locale)
220
+
221
+ class AsciiStandardAnalyzerTest < Test::Unit::TestCase
222
+ include Ferret::Analysis
223
+
224
+ def test_standard_analyzer()
225
+ input = 'DBalmain@gmail.com is My e-mail 52   #$ Address. 23#!$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234'
226
+ a = AsciiStandardAnalyzer.new()
227
+ t = a.token_stream("fieldname", input)
228
+ t2 = a.token_stream("fieldname", input)
229
+ assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
230
+ assert_equal(Token.new('email', 25, 31, 3), t.next)
231
+ assert_equal(Token.new('e', 25, 26, 0), t.next)
232
+ assert_equal(Token.new('mail', 27, 31), t.next)
233
+ assert_equal(Token.new('52', 32, 34), t.next)
234
+ assert_equal(Token.new('address', 40, 47), t.next)
235
+ assert_equal(Token.new('23', 49, 51), t.next)
236
+ assert_equal(Token.new('www.google.com/results', 55, 85), t.next)
237
+ assert_equal(Token.new('tnt', 86, 91), t.next)
238
+ assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
239
+ assert(! t.next())
240
+ assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t2.next)
241
+ assert_equal(Token.new('email', 25, 31, 3), t2.next)
242
+ assert_equal(Token.new('e', 25, 26, 0), t2.next)
243
+ assert_equal(Token.new('mail', 27, 31), t2.next)
244
+ assert_equal(Token.new('52', 32, 34), t2.next)
245
+ assert_equal(Token.new('address', 40, 47), t2.next)
246
+ assert_equal(Token.new('23', 49, 51), t2.next)
247
+ assert_equal(Token.new('www.google.com/results', 55, 85), t2.next)
248
+ assert_equal(Token.new('tnt', 86, 91), t2.next)
249
+ assert_equal(Token.new('123-1235-asd-1234', 93, 110), t2.next)
250
+ assert(! t2.next())
251
+ a = AsciiStandardAnalyzer.new(ENGLISH_STOP_WORDS, false)
252
+ t = a.token_stream("fieldname", input)
253
+ t2 = a.token_stream("fieldname", input)
254
+ assert_equal(Token.new('DBalmain@gmail.com', 0, 18), t.next)
255
+ assert_equal(Token.new('My', 22, 24), t.next)
256
+ assert_equal(Token.new('email', 25, 31, 3), t.next)
257
+ assert_equal(Token.new('e', 25, 26, 0), t.next)
258
+ assert_equal(Token.new('mail', 27, 31), t.next)
259
+ assert_equal(Token.new('52', 32, 34), t.next)
260
+ assert_equal(Token.new('Address', 40, 47), t.next)
261
+ assert_equal(Token.new('23', 49, 51), t.next)
262
+ assert_equal(Token.new('www.google.com/results', 55, 85), t.next)
263
+ assert_equal(Token.new('TNT', 86, 91), t.next)
264
+ assert_equal(Token.new('123-1235-ASD-1234', 93, 110), t.next)
265
+ assert(! t.next())
266
+ end
267
+ end
268
+
269
+ class StandardAnalyzerTest < Test::Unit::TestCase
270
+ include Ferret::Analysis
271
+
272
+ def test_standard_analyzer()
273
+ input = 'DBalmán@gmail.com is My e-mail and the Address. 23#!$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234 23#!$ ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ'
274
+ a = StandardAnalyzer.new()
275
+ t = a.token_stream("fieldname", input)
276
+ t2 = a.token_stream("fieldname", input)
277
+ assert_equal(Token.new('dbalmán@gmail.com', 0, 18), t.next)
278
+ assert_equal(Token.new('email', 25, 31, 3), t.next)
279
+ assert_equal(Token.new('e', 25, 26, 0), t.next)
280
+ assert_equal(Token.new('mail', 27, 31), t.next)
281
+ assert_equal(Token.new('address', 40, 47), t.next)
282
+ assert_equal(Token.new('23', 49, 51), t.next)
283
+ assert_equal(Token.new('www.google.com/results', 55, 85), t.next)
284
+ assert_equal(Token.new('tnt', 86, 91), t.next)
285
+ assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
286
+ assert_equal(Token.new('23', 111, 113), t.next)
287
+ assert_equal(Token.new('áägç', 117, 124), t.next)
288
+ assert_equal(Token.new('êëì', 126, 132), t.next)
289
+ assert_equal(Token.new('úøã', 134, 140), t.next)
290
+ assert_equal(Token.new('öîí', 142, 148), t.next)
291
+ assert(! t.next())
292
+ assert_equal(Token.new('dbalmán@gmail.com', 0, 18), t2.next)
293
+ assert_equal(Token.new('email', 25, 31, 3), t2.next)
294
+ assert_equal(Token.new('e', 25, 26, 0), t2.next)
295
+ assert_equal(Token.new('mail', 27, 31), t2.next)
296
+ assert_equal(Token.new('address', 40, 47), t2.next)
297
+ assert_equal(Token.new('23', 49, 51), t2.next)
298
+ assert_equal(Token.new('www.google.com/results', 55, 85), t2.next)
299
+ assert_equal(Token.new('tnt', 86, 91), t2.next)
300
+ assert_equal(Token.new('123-1235-asd-1234', 93, 110), t2.next)
301
+ assert_equal(Token.new('23', 111, 113), t2.next)
302
+ assert_equal(Token.new('áägç', 117, 124), t2.next)
303
+ assert_equal(Token.new('êëì', 126, 132), t2.next)
304
+ assert_equal(Token.new('úøã', 134, 140), t2.next)
305
+ assert_equal(Token.new('öîí', 142, 148), t2.next)
306
+ assert(! t2.next())
307
+ a = StandardAnalyzer.new(nil, false)
308
+ t = a.token_stream("fieldname", input)
309
+ assert_equal(Token.new('DBalmán@gmail.com', 0, 18), t.next)
310
+ assert_equal(Token.new('My', 22, 24), t.next)
311
+ assert_equal(Token.new('email', 25, 31, 3), t.next)
312
+ assert_equal(Token.new('e', 25, 26, 0), t.next)
313
+ assert_equal(Token.new('mail', 27, 31), t.next)
314
+ assert_equal(Token.new('Address', 40, 47), t.next)
315
+ assert_equal(Token.new('23', 49, 51), t.next)
316
+ assert_equal(Token.new('www.google.com/results', 55, 85), t.next)
317
+ assert_equal(Token.new('TNT', 86, 91), t.next)
318
+ assert_equal(Token.new('123-1235-ASD-1234', 93, 110), t.next)
319
+ assert_equal(Token.new('23', 111, 113), t.next)
320
+ assert_equal(Token.new('ÁÄGÇ', 117, 124), t.next)
321
+ assert_equal(Token.new('ÊËÌ', 126, 132), t.next)
322
+ assert_equal(Token.new('ÚØÃ', 134, 140), t.next)
323
+ assert_equal(Token.new('ÖÎÍ', 142, 148), t.next)
324
+ assert(! t.next())
325
+ a = StandardAnalyzer.new(["e-mail", "23", "tnt"])
326
+ t = a.token_stream("fieldname", input)
327
+ t2 = a.token_stream("fieldname", input)
328
+ assert_equal(Token.new('dbalmán@gmail.com', 0, 18), t.next)
329
+ assert_equal(Token.new('is', 19, 21), t.next)
330
+ assert_equal(Token.new('my', 22, 24), t.next)
331
+ assert_equal(Token.new('and', 32, 35), t.next)
332
+ assert_equal(Token.new('the', 36, 39), t.next)
333
+ assert_equal(Token.new('address', 40, 47), t.next)
334
+ assert_equal(Token.new('www.google.com/results', 55, 85), t.next)
335
+ assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
336
+ assert_equal(Token.new('áägç', 117, 124), t.next)
337
+ assert_equal(Token.new('êëì', 126, 132), t.next)
338
+ assert_equal(Token.new('úøã', 134, 140), t.next)
339
+ assert_equal(Token.new('öîí', 142, 148), t.next)
340
+ assert(! t.next())
341
+ assert_equal(Token.new('dbalmán@gmail.com', 0, 18), t2.next)
342
+ assert_equal(Token.new('is', 19, 21), t2.next)
343
+ assert_equal(Token.new('my', 22, 24), t2.next)
344
+ assert_equal(Token.new('and', 32, 35), t2.next)
345
+ assert_equal(Token.new('the', 36, 39), t2.next)
346
+ assert_equal(Token.new('address', 40, 47), t2.next)
347
+ assert_equal(Token.new('www.google.com/results', 55, 85), t2.next)
348
+ assert_equal(Token.new('123-1235-asd-1234', 93, 110), t2.next)
349
+ assert_equal(Token.new('áägç', 117, 124), t2.next)
350
+ assert_equal(Token.new('êëì', 126, 132), t2.next)
351
+ assert_equal(Token.new('úøã', 134, 140), t2.next)
352
+ assert_equal(Token.new('öîí', 142, 148), t2.next)
353
+ assert(! t2.next())
354
+ end
355
+ end if (/utf-8/i =~ Ferret.locale)
356
+
357
+ class PerFieldAnalyzerTest < Test::Unit::TestCase
358
+ include Ferret::Analysis
359
+ def test_per_field_analyzer()
360
+ input = 'DBalmain@gmail.com is My e-mail 52   #$ address. 23#!$'
361
+ pfa = PerFieldAnalyzer.new(StandardAnalyzer.new())
362
+ pfa['white'] = WhiteSpaceAnalyzer.new(false)
363
+ pfa['white_l'] = WhiteSpaceAnalyzer.new(true)
364
+ pfa['letter'] = LetterAnalyzer.new(false)
365
+ pfa.add_field('letter', LetterAnalyzer.new(true))
366
+ pfa.add_field('letter_u', LetterAnalyzer.new(false))
367
+ t = pfa.token_stream('white', input)
368
+ assert_equal(Token.new('DBalmain@gmail.com', 0, 18), t.next)
369
+ assert_equal(Token.new('is', 19, 21), t.next)
370
+ assert_equal(Token.new('My', 22, 24), t.next)
371
+ assert_equal(Token.new('e-mail', 25, 31), t.next)
372
+ assert_equal(Token.new('52', 32, 34), t.next)
373
+ assert_equal(Token.new('#$', 37, 39), t.next)
374
+ assert_equal(Token.new('address.', 40, 48), t.next)
375
+ assert_equal(Token.new('23#!$', 49, 54), t.next)
376
+ assert(! t.next())
377
+ t = pfa.token_stream('white_l', input)
378
+ assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
379
+ assert_equal(Token.new('is', 19, 21), t.next)
380
+ assert_equal(Token.new('my', 22, 24), t.next)
381
+ assert_equal(Token.new('e-mail', 25, 31), t.next)
382
+ assert_equal(Token.new('52', 32, 34), t.next)
383
+ assert_equal(Token.new('#$', 37, 39), t.next)
384
+ assert_equal(Token.new('address.', 40, 48), t.next)
385
+ assert_equal(Token.new('23#!$', 49, 54), t.next)
386
+ assert(! t.next())
387
+ t = pfa.token_stream('letter_u', input)
388
+ assert_equal(Token.new('DBalmain', 0, 8), t.next)
389
+ assert_equal(Token.new('gmail', 9, 14), t.next)
390
+ assert_equal(Token.new('com', 15, 18), t.next)
391
+ assert_equal(Token.new('is', 19, 21), t.next)
392
+ assert_equal(Token.new('My', 22, 24), t.next)
393
+ assert_equal(Token.new('e', 25, 26), t.next)
394
+ assert_equal(Token.new('mail', 27, 31), t.next)
395
+ assert_equal(Token.new('address', 40, 47), t.next)
396
+ assert(! t.next())
397
+ t = pfa.token_stream('letter', input)
398
+ assert_equal(Token.new('dbalmain', 0, 8), t.next)
399
+ assert_equal(Token.new('gmail', 9, 14), t.next)
400
+ assert_equal(Token.new('com', 15, 18), t.next)
401
+ assert_equal(Token.new('is', 19, 21), t.next)
402
+ assert_equal(Token.new('my', 22, 24), t.next)
403
+ assert_equal(Token.new('e', 25, 26), t.next)
404
+ assert_equal(Token.new('mail', 27, 31), t.next)
405
+ assert_equal(Token.new('address', 40, 47), t.next)
406
+ assert(! t.next())
407
+ t = pfa.token_stream('XXX', input) # should use default StandardAnalyzer
408
+ assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
409
+ assert_equal(Token.new('email', 25, 31, 3), t.next)
410
+ assert_equal(Token.new('e', 25, 26, 0), t.next)
411
+ assert_equal(Token.new('mail', 27, 31), t.next)
412
+ assert_equal(Token.new('52', 32, 34), t.next)
413
+ assert_equal(Token.new('address', 40, 47), t.next)
414
+ assert_equal(Token.new('23', 49, 51), t.next)
415
+ assert(! t.next())
416
+ end
417
+ end
418
+
419
+ class RegExpAnalyzerTest < Test::Unit::TestCase
420
+ include Ferret::Analysis
421
+
422
+ def test_reg_exp_analyzer()
423
+ input = 'DBalmain@gmail.com is My e-mail 52   #$ Address. 23#!$ http://www.google.com/RESULT_3.html T.N.T. 123-1235-ASD-1234 23 Rob\'s'
424
+ a = RegExpAnalyzer.new()
425
+ t = a.token_stream('XXX', input)
426
+ t2 = a.token_stream('XXX', "one_Two three")
427
+ assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
428
+ assert_equal(Token.new('is', 19, 21), t.next)
429
+ assert_equal(Token.new('my', 22, 24), t.next)
430
+ assert_equal(Token.new('e-mail', 25, 31), t.next)
431
+ assert_equal(Token.new('52', 32, 34), t.next)
432
+ assert_equal(Token.new('address', 40, 47), t.next)
433
+ assert_equal(Token.new('23', 49, 51), t.next)
434
+ assert_equal(Token.new('http://www.google.com/result_3.html', 55, 90), t.next)
435
+ assert_equal(Token.new('t.n.t.', 91, 97), t.next)
436
+ assert_equal(Token.new('123-1235-asd-1234', 98, 115), t.next)
437
+ assert_equal(Token.new('23', 116, 118), t.next)
438
+ assert_equal(Token.new('rob\'s', 119, 124), t.next)
439
+ assert(! t.next())
440
+ t = t2
441
+ assert_equal(Token.new("one_two", 0, 7), t.next())
442
+ assert_equal(Token.new("three", 8, 13), t.next())
443
+ assert(! t.next())
444
+ a = RegExpAnalyzer.new(/\w{2,}/, false)
445
+ t = a.token_stream('XXX', input)
446
+ t2 = a.token_stream('XXX', "one Two three")
447
+ assert_equal(Token.new('DBalmain', 0, 8), t.next)
448
+ assert_equal(Token.new('gmail', 9, 14), t.next)
449
+ assert_equal(Token.new('com', 15, 18), t.next)
450
+ assert_equal(Token.new('is', 19, 21), t.next)
451
+ assert_equal(Token.new('My', 22, 24), t.next)
452
+ assert_equal(Token.new('mail', 27, 31), t.next)
453
+ assert_equal(Token.new('52', 32, 34), t.next)
454
+ assert_equal(Token.new('Address', 40, 47), t.next)
455
+ assert_equal(Token.new('23', 49, 51), t.next)
456
+ assert_equal(Token.new('http', 55, 59), t.next)
457
+ assert_equal(Token.new('www', 62, 65), t.next)
458
+ assert_equal(Token.new('google', 66, 72), t.next)
459
+ assert_equal(Token.new('com', 73, 76), t.next)
460
+ assert_equal(Token.new('RESULT_3', 77, 85), t.next)
461
+ assert_equal(Token.new('html', 86, 90), t.next)
462
+ assert_equal(Token.new('123', 98, 101), t.next)
463
+ assert_equal(Token.new('1235', 102, 106), t.next)
464
+ assert_equal(Token.new('ASD', 107, 110), t.next)
465
+ assert_equal(Token.new('1234', 111, 115), t.next)
466
+ assert_equal(Token.new('23', 116, 118), t.next)
467
+ assert_equal(Token.new('Rob', 119, 122), t.next)
468
+ assert(! t.next())
469
+ assert_equal(Token.new("one", 0, 3), t2.next())
470
+ assert_equal(Token.new("Two", 4, 7), t2.next())
471
+ assert_equal(Token.new("three", 8, 13), t2.next())
472
+ assert(! t2.next())
473
+ a = RegExpAnalyzer.new() do |str|
474
+ if str =~ /^[[:alpha:]]\.([[:alpha:]]\.)+$/
475
+ str.gsub!(/\./, '')
476
+ elsif str =~ /'[sS]$/
477
+ str.gsub!(/'[sS]$/, '')
478
+ end
479
+ str
480
+ end
481
+ t = a.token_stream('XXX', input)
482
+ t2 = a.token_stream('XXX', "one's don't T.N.T.")
483
+ assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
484
+ assert_equal(Token.new('is', 19, 21), t.next)
485
+ assert_equal(Token.new('my', 22, 24), t.next)
486
+ assert_equal(Token.new('e-mail', 25, 31), t.next)
487
+ assert_equal(Token.new('52', 32, 34), t.next)
488
+ assert_equal(Token.new('address', 40, 47), t.next)
489
+ assert_equal(Token.new('23', 49, 51), t.next)
490
+ assert_equal(Token.new('http://www.google.com/result_3.html', 55, 90), t.next)
491
+ assert_equal(Token.new('tnt', 91, 97), t.next)
492
+ assert_equal(Token.new('123-1235-asd-1234', 98, 115), t.next)
493
+ assert_equal(Token.new('23', 116, 118), t.next)
494
+ assert_equal(Token.new('rob', 119, 124), t.next)
495
+ assert(! t.next())
496
+ assert_equal(Token.new("one", 0, 5), t2.next())
497
+ assert_equal(Token.new("don't", 6, 11), t2.next())
498
+ assert_equal(Token.new("tnt", 12, 18), t2.next())
499
+ assert(! t2.next())
500
+ end
501
+ end
502
+
503
# Reopens Ferret::Analysis to add a small analyzer used by the tests in
# this file.
module Ferret::Analysis
  # A StandardAnalyzer whose token stream is passed through a StemFilter.
  class StemmingStandardAnalyzer < StandardAnalyzer
    # Returns the parent class's token stream for +field+ and +text+,
    # wrapped in a StemFilter. Bare +super+ forwards both arguments as-is.
    def token_stream(field, text)
      standard_stream = super
      StemFilter.new(standard_stream)
    end
  end
end
510
+
511
# Checks StemmingStandardAnalyzer (StandardAnalyzer wrapped in a StemFilter)
# and a hand-built StemFilter chain on text containing multi-byte letters.
# The whole class is only defined when Ferret.locale mentions UTF-8, because
# the expected offsets below are byte offsets into UTF-8 encoded text.
class CustomAnalyzerTest < Test::Unit::TestCase
  include Ferret::Analysis

  def test_custom_filter()
    # The tail of this string is deliberately three-letter groups of
    # two-byte characters separated by the two-byte symbols (R), macron and
    # not-sign: AAGC (R) EEI macron UOA not-sign OII (with their accents).
    # The expected tokens 'êëì' (126..132) and 'úøã' (134..140) depend on
    # exactly these characters, so the literal must not be "repaired" by
    # encoding-fixing tools.
    input = 'DBalmán@gmail.com is My e-mail and the Address. 23#!$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234 23#!$ ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ'
    a = StemmingStandardAnalyzer.new()
    t = a.token_stream("fieldname", input)
    # "is", "My", "and" and "the" do not appear in the expected output
    # (presumably removed as stop words by the standard analyzer).
    check_stream(t, [
      ['dbalmán@gmail.com', 0, 18],
      ['email', 25, 31, 3],
      ['e', 25, 26, 0],
      ['mail', 27, 31],
      ['address', 40, 47],
      ['23', 49, 51],
      ['www.google.com/result', 55, 85],
      ['tnt', 86, 91],
      ['123-1235-asd-1234', 93, 110],
      ['23', 111, 113],
      ['áägç', 117, 124],
      ['êëì', 126, 132],
      ['úøã', 134, 140],
      ['öîí', 142, 148]
    ])

    # Every inflection of "debate" is expected to reduce to the same stem.
    input = "Debate Debates DEBATED DEBating Debater"
    t = a.token_stream("fieldname", input)
    check_stream(t, [
      ["debat", 0, 6],
      ["debat", 7, 14],
      ["debat", 15, 22],
      ["debat", 23, 31],
      ["debat", 32, 39]
    ])

    # Same words with a two-byte "ê", through an explicitly assembled
    # LetterTokenizer -> LowerCaseFilter -> StemFilter(:english) chain.
    input = "Dêbate dêbates DÊBATED DÊBATing dêbater"
    t = StemFilter.new(LowerCaseFilter.new(LetterTokenizer.new(input)), :english)
    check_stream(t, [
      ["dêbate", 0, 7],
      ["dêbate", 8, 16],
      ["dêbate", 17, 25],
      ["dêbate", 26, 35],
      ["dêbater", 36, 44]
    ])
  end

  private

  # Reads one token from +stream+ per row of +rows+ and compares it with
  # Token.new(*row); afterwards asserts that the stream yields nothing more.
  def check_stream(stream, rows)
    rows.each do |token_args|
      assert_equal(Token.new(*token_args), stream.next)
    end
    assert(!stream.next)
  end
end if (/utf-8/i =~ Ferret.locale)