ferret 0.9.0 → 0.9.1

Files changed (187)
  1. data/Rakefile +23 -5
  2. data/TODO +2 -1
  3. data/ext/analysis.c +838 -177
  4. data/ext/analysis.h +55 -7
  5. data/ext/api.c +69 -0
  6. data/ext/api.h +27 -0
  7. data/ext/array.c +8 -5
  8. data/ext/compound_io.c +132 -96
  9. data/ext/document.c +58 -28
  10. data/ext/except.c +59 -0
  11. data/ext/except.h +88 -0
  12. data/ext/ferret.c +47 -3
  13. data/ext/ferret.h +3 -0
  14. data/ext/field.c +15 -9
  15. data/ext/filter.c +1 -1
  16. data/ext/fs_store.c +215 -34
  17. data/ext/global.c +72 -3
  18. data/ext/global.h +4 -3
  19. data/ext/hash.c +44 -3
  20. data/ext/hash.h +9 -0
  21. data/ext/header.h +58 -0
  22. data/ext/inc/except.h +88 -0
  23. data/ext/inc/lang.h +23 -13
  24. data/ext/ind.c +16 -10
  25. data/ext/index.h +2 -22
  26. data/ext/index_io.c +3 -11
  27. data/ext/index_rw.c +245 -193
  28. data/ext/lang.h +23 -13
  29. data/ext/libstemmer.c +92 -0
  30. data/ext/libstemmer.h +79 -0
  31. data/ext/modules.h +162 -0
  32. data/ext/q_boolean.c +34 -21
  33. data/ext/q_const_score.c +6 -12
  34. data/ext/q_filtered_query.c +206 -0
  35. data/ext/q_fuzzy.c +18 -15
  36. data/ext/q_match_all.c +3 -7
  37. data/ext/q_multi_phrase.c +10 -14
  38. data/ext/q_parser.c +29 -2
  39. data/ext/q_phrase.c +14 -21
  40. data/ext/q_prefix.c +15 -12
  41. data/ext/q_range.c +30 -28
  42. data/ext/q_span.c +13 -21
  43. data/ext/q_term.c +17 -26
  44. data/ext/r_analysis.c +693 -21
  45. data/ext/r_doc.c +11 -12
  46. data/ext/r_index_io.c +4 -1
  47. data/ext/r_qparser.c +21 -2
  48. data/ext/r_search.c +285 -18
  49. data/ext/ram_store.c +5 -2
  50. data/ext/search.c +11 -17
  51. data/ext/search.h +21 -45
  52. data/ext/similarity.h +67 -0
  53. data/ext/sort.c +30 -25
  54. data/ext/stem_ISO_8859_1_danish.c +338 -0
  55. data/ext/stem_ISO_8859_1_danish.h +16 -0
  56. data/ext/stem_ISO_8859_1_dutch.c +635 -0
  57. data/ext/stem_ISO_8859_1_dutch.h +16 -0
  58. data/ext/stem_ISO_8859_1_english.c +1156 -0
  59. data/ext/stem_ISO_8859_1_english.h +16 -0
  60. data/ext/stem_ISO_8859_1_finnish.c +792 -0
  61. data/ext/stem_ISO_8859_1_finnish.h +16 -0
  62. data/ext/stem_ISO_8859_1_french.c +1276 -0
  63. data/ext/stem_ISO_8859_1_french.h +16 -0
  64. data/ext/stem_ISO_8859_1_german.c +512 -0
  65. data/ext/stem_ISO_8859_1_german.h +16 -0
  66. data/ext/stem_ISO_8859_1_italian.c +1091 -0
  67. data/ext/stem_ISO_8859_1_italian.h +16 -0
  68. data/ext/stem_ISO_8859_1_norwegian.c +296 -0
  69. data/ext/stem_ISO_8859_1_norwegian.h +16 -0
  70. data/ext/stem_ISO_8859_1_porter.c +776 -0
  71. data/ext/stem_ISO_8859_1_porter.h +16 -0
  72. data/ext/stem_ISO_8859_1_portuguese.c +1035 -0
  73. data/ext/stem_ISO_8859_1_portuguese.h +16 -0
  74. data/ext/stem_ISO_8859_1_spanish.c +1119 -0
  75. data/ext/stem_ISO_8859_1_spanish.h +16 -0
  76. data/ext/stem_ISO_8859_1_swedish.c +307 -0
  77. data/ext/stem_ISO_8859_1_swedish.h +16 -0
  78. data/ext/stem_KOI8_R_russian.c +701 -0
  79. data/ext/stem_KOI8_R_russian.h +16 -0
  80. data/ext/stem_UTF_8_danish.c +344 -0
  81. data/ext/stem_UTF_8_danish.h +16 -0
  82. data/ext/stem_UTF_8_dutch.c +653 -0
  83. data/ext/stem_UTF_8_dutch.h +16 -0
  84. data/ext/stem_UTF_8_english.c +1176 -0
  85. data/ext/stem_UTF_8_english.h +16 -0
  86. data/ext/stem_UTF_8_finnish.c +808 -0
  87. data/ext/stem_UTF_8_finnish.h +16 -0
  88. data/ext/stem_UTF_8_french.c +1296 -0
  89. data/ext/stem_UTF_8_french.h +16 -0
  90. data/ext/stem_UTF_8_german.c +526 -0
  91. data/ext/stem_UTF_8_german.h +16 -0
  92. data/ext/stem_UTF_8_italian.c +1113 -0
  93. data/ext/stem_UTF_8_italian.h +16 -0
  94. data/ext/stem_UTF_8_norwegian.c +302 -0
  95. data/ext/stem_UTF_8_norwegian.h +16 -0
  96. data/ext/stem_UTF_8_porter.c +794 -0
  97. data/ext/stem_UTF_8_porter.h +16 -0
  98. data/ext/stem_UTF_8_portuguese.c +1055 -0
  99. data/ext/stem_UTF_8_portuguese.h +16 -0
  100. data/ext/stem_UTF_8_russian.c +709 -0
  101. data/ext/stem_UTF_8_russian.h +16 -0
  102. data/ext/stem_UTF_8_spanish.c +1137 -0
  103. data/ext/stem_UTF_8_spanish.h +16 -0
  104. data/ext/stem_UTF_8_swedish.c +313 -0
  105. data/ext/stem_UTF_8_swedish.h +16 -0
  106. data/ext/stopwords.c +325 -0
  107. data/ext/store.c +34 -2
  108. data/ext/tags +2953 -0
  109. data/ext/term.c +21 -15
  110. data/ext/termdocs.c +5 -3
  111. data/ext/utilities.c +446 -0
  112. data/ext/vector.c +27 -13
  113. data/lib/ferret/document/document.rb +1 -1
  114. data/lib/ferret/index/index.rb +44 -6
  115. data/lib/ferret/query_parser/query_parser.tab.rb +7 -3
  116. data/lib/rferret.rb +2 -1
  117. data/test/test_helper.rb +2 -2
  118. data/test/unit/analysis/ctc_analyzer.rb +401 -0
  119. data/test/unit/analysis/ctc_tokenstream.rb +423 -0
  120. data/test/unit/analysis/{tc_letter_tokenizer.rb → rtc_letter_tokenizer.rb} +0 -0
  121. data/test/unit/analysis/{tc_lower_case_filter.rb → rtc_lower_case_filter.rb} +0 -0
  122. data/test/unit/analysis/{tc_lower_case_tokenizer.rb → rtc_lower_case_tokenizer.rb} +0 -0
  123. data/test/unit/analysis/{tc_per_field_analyzer_wrapper.rb → rtc_per_field_analyzer_wrapper.rb} +0 -0
  124. data/test/unit/analysis/{tc_porter_stem_filter.rb → rtc_porter_stem_filter.rb} +0 -0
  125. data/test/unit/analysis/{tc_standard_analyzer.rb → rtc_standard_analyzer.rb} +0 -0
  126. data/test/unit/analysis/{tc_standard_tokenizer.rb → rtc_standard_tokenizer.rb} +0 -0
  127. data/test/unit/analysis/{tc_stop_analyzer.rb → rtc_stop_analyzer.rb} +0 -0
  128. data/test/unit/analysis/{tc_stop_filter.rb → rtc_stop_filter.rb} +0 -0
  129. data/test/unit/analysis/{tc_white_space_analyzer.rb → rtc_white_space_analyzer.rb} +0 -0
  130. data/test/unit/analysis/{tc_white_space_tokenizer.rb → rtc_white_space_tokenizer.rb} +0 -0
  131. data/test/unit/analysis/{tc_word_list_loader.rb → rtc_word_list_loader.rb} +0 -0
  132. data/test/unit/analysis/tc_analyzer.rb +1 -2
  133. data/test/unit/analysis/{c_token.rb → tc_token.rb} +0 -0
  134. data/test/unit/document/rtc_field.rb +28 -0
  135. data/test/unit/document/{c_document.rb → tc_document.rb} +0 -0
  136. data/test/unit/document/tc_field.rb +82 -12
  137. data/test/unit/index/{tc_compound_file_io.rb → rtc_compound_file_io.rb} +0 -0
  138. data/test/unit/index/{tc_field_infos.rb → rtc_field_infos.rb} +0 -0
  139. data/test/unit/index/{tc_fields_io.rb → rtc_fields_io.rb} +0 -0
  140. data/test/unit/index/{tc_multiple_term_doc_pos_enum.rb → rtc_multiple_term_doc_pos_enum.rb} +0 -0
  141. data/test/unit/index/{tc_segment_infos.rb → rtc_segment_infos.rb} +0 -0
  142. data/test/unit/index/{tc_segment_term_docs.rb → rtc_segment_term_docs.rb} +0 -0
  143. data/test/unit/index/{tc_segment_term_enum.rb → rtc_segment_term_enum.rb} +0 -0
  144. data/test/unit/index/{tc_segment_term_vector.rb → rtc_segment_term_vector.rb} +0 -0
  145. data/test/unit/index/{tc_term_buffer.rb → rtc_term_buffer.rb} +0 -0
  146. data/test/unit/index/{tc_term_info.rb → rtc_term_info.rb} +0 -0
  147. data/test/unit/index/{tc_term_infos_io.rb → rtc_term_infos_io.rb} +0 -0
  148. data/test/unit/index/{tc_term_vectors_io.rb → rtc_term_vectors_io.rb} +0 -0
  149. data/test/unit/index/{c_index.rb → tc_index.rb} +26 -6
  150. data/test/unit/index/{c_index_reader.rb → tc_index_reader.rb} +0 -0
  151. data/test/unit/index/{c_index_writer.rb → tc_index_writer.rb} +0 -0
  152. data/test/unit/index/{c_term.rb → tc_term.rb} +0 -0
  153. data/test/unit/index/{c_term_voi.rb → tc_term_voi.rb} +0 -0
  154. data/test/unit/query_parser/{c_query_parser.rb → rtc_query_parser.rb} +14 -14
  155. data/test/unit/query_parser/tc_query_parser.rb +24 -16
  156. data/test/unit/search/{tc_similarity.rb → rtc_similarity.rb} +0 -0
  157. data/test/unit/search/rtc_sort_field.rb +14 -0
  158. data/test/unit/search/{c_filter.rb → tc_filter.rb} +11 -11
  159. data/test/unit/search/{c_fuzzy_query.rb → tc_fuzzy_query.rb} +0 -0
  160. data/test/unit/search/{c_index_searcher.rb → tc_index_searcher.rb} +0 -0
  161. data/test/unit/search/{c_search_and_sort.rb → tc_search_and_sort.rb} +0 -0
  162. data/test/unit/search/{c_sort.rb → tc_sort.rb} +0 -0
  163. data/test/unit/search/tc_sort_field.rb +20 -7
  164. data/test/unit/search/{c_spans.rb → tc_spans.rb} +0 -0
  165. data/test/unit/store/rtc_fs_store.rb +62 -0
  166. data/test/unit/store/rtc_ram_store.rb +15 -0
  167. data/test/unit/store/rtm_store.rb +150 -0
  168. data/test/unit/store/rtm_store_lock.rb +2 -0
  169. data/test/unit/store/tc_fs_store.rb +54 -40
  170. data/test/unit/store/tc_ram_store.rb +20 -0
  171. data/test/unit/store/tm_store.rb +30 -146
  172. data/test/unit/store/tm_store_lock.rb +66 -0
  173. data/test/unit/utils/{tc_bit_vector.rb → rtc_bit_vector.rb} +0 -0
  174. data/test/unit/utils/{tc_date_tools.rb → rtc_date_tools.rb} +0 -0
  175. data/test/unit/utils/{tc_number_tools.rb → rtc_number_tools.rb} +0 -0
  176. data/test/unit/utils/{tc_parameter.rb → rtc_parameter.rb} +0 -0
  177. data/test/unit/utils/{tc_priority_queue.rb → rtc_priority_queue.rb} +0 -0
  178. data/test/unit/utils/{tc_string_helper.rb → rtc_string_helper.rb} +0 -0
  179. data/test/unit/utils/{tc_thread.rb → rtc_thread.rb} +0 -0
  180. data/test/unit/utils/{tc_weak_key_hash.rb → rtc_weak_key_hash.rb} +0 -0
  181. metadata +360 -289
  182. data/test/unit/document/c_field.rb +0 -98
  183. data/test/unit/search/c_sort_field.rb +0 -27
  184. data/test/unit/store/c_fs_store.rb +0 -76
  185. data/test/unit/store/c_ram_store.rb +0 -35
  186. data/test/unit/store/m_store.rb +0 -34
  187. data/test/unit/store/m_store_lock.rb +0 -68
data/test/unit/analysis/ctc_tokenstream.rb
@@ -0,0 +1,423 @@
+require File.dirname(__FILE__) + "/../../test_helper"
+
+class AsciiLetterTokenizerTest < Test::Unit::TestCase
+  include Ferret::Analysis
+
+  def test_letter_tokenizer()
+    input = 'DBalmain@gmail.com is My e-mail 523@#$ ADDRESS. 23#@$'
+    t = AsciiLetterTokenizer.new(input)
+    assert_equal(Token.new("DBalmain", 0, 8), t.next())
+    assert_equal(Token.new("gmail", 9, 14), t.next())
+    assert_equal(Token.new("com", 15, 18), t.next())
+    assert_equal(Token.new("is", 19, 21), t.next())
+    assert_equal(Token.new("My", 22, 24), t.next())
+    assert_equal(Token.new("e", 25, 26), t.next())
+    assert_equal(Token.new("mail", 27, 31), t.next())
+    assert_equal(Token.new("ADDRESS", 39, 46), t.next())
+    assert(! t.next())
+    t.text = "one_two three"
+    assert_equal(Token.new("one", 0, 3), t.next())
+    assert_equal(Token.new("two", 4, 7), t.next())
+    assert_equal(Token.new("three", 8, 13), t.next())
+    assert(! t.next())
+    t = AsciiLowerCaseFilter.new(AsciiLetterTokenizer.new(input))
+    assert_equal(Token.new("dbalmain", 0, 8), t.next())
+    assert_equal(Token.new("gmail", 9, 14), t.next())
+    assert_equal(Token.new("com", 15, 18), t.next())
+    assert_equal(Token.new("is", 19, 21), t.next())
+    assert_equal(Token.new("my", 22, 24), t.next())
+    assert_equal(Token.new("e", 25, 26), t.next())
+    assert_equal(Token.new("mail", 27, 31), t.next())
+    assert_equal(Token.new("address", 39, 46), t.next())
+    assert(! t.next())
+  end
+end
+
+class LetterTokenizerTest < Test::Unit::TestCase
+  include Ferret::Analysis
+
+  def test_letter_tokenizer()
+    input = 'DBalmän@gmail.com is My e-mail 52   #$ address. 23#@$ ÁÄGÇ®ÊË̯ÚØÃ¬ÖÎÍ'
+    t = LetterTokenizer.new(input)
+    assert_equal(Token.new('DBalmän', 0, 8), t.next)
+    assert_equal(Token.new('gmail', 9, 14), t.next)
+    assert_equal(Token.new('com', 15, 18), t.next)
+    assert_equal(Token.new('is', 19, 21), t.next)
+    assert_equal(Token.new('My', 22, 24), t.next)
+    assert_equal(Token.new('e', 25, 26), t.next)
+    assert_equal(Token.new('mail', 27, 31), t.next)
+    assert_equal(Token.new('address', 40, 47), t.next)
+    assert_equal(Token.new('ÁÄGÇ', 55, 62), t.next)
+    assert_equal(Token.new('ÊËÌ', 64, 70), t.next)
+    assert_equal(Token.new('ÚØÃ', 72, 78), t.next)
+    assert_equal(Token.new('ÖÎÍ', 80, 86), t.next)
+    assert(! t.next())
+    t.text = "one_two three"
+    assert_equal(Token.new("one", 0, 3), t.next())
+    assert_equal(Token.new("two", 4, 7), t.next())
+    assert_equal(Token.new("three", 8, 13), t.next())
+    assert(! t.next())
+    t = LowerCaseFilter.new(LetterTokenizer.new(input))
+    assert_equal(Token.new('dbalmän', 0, 8), t.next)
+    assert_equal(Token.new('gmail', 9, 14), t.next)
+    assert_equal(Token.new('com', 15, 18), t.next)
+    assert_equal(Token.new('is', 19, 21), t.next)
+    assert_equal(Token.new('my', 22, 24), t.next)
+    assert_equal(Token.new('e', 25, 26), t.next)
+    assert_equal(Token.new('mail', 27, 31), t.next)
+    assert_equal(Token.new('address', 40, 47), t.next)
+    assert_equal(Token.new('áägç', 55, 62), t.next)
+    assert_equal(Token.new('êëì', 64, 70), t.next)
+    assert_equal(Token.new('úøã', 72, 78), t.next)
+    assert_equal(Token.new('öîí', 80, 86), t.next)
+    assert(! t.next())
+    t = LetterTokenizer.new(input, true)
+    assert_equal(Token.new('dbalmän', 0, 8), t.next)
+    assert_equal(Token.new('gmail', 9, 14), t.next)
+    assert_equal(Token.new('com', 15, 18), t.next)
+    assert_equal(Token.new('is', 19, 21), t.next)
+    assert_equal(Token.new('my', 22, 24), t.next)
+    assert_equal(Token.new('e', 25, 26), t.next)
+    assert_equal(Token.new('mail', 27, 31), t.next)
+    assert_equal(Token.new('address', 40, 47), t.next)
+    assert_equal(Token.new('áägç', 55, 62), t.next)
+    assert_equal(Token.new('êëì', 64, 70), t.next)
+    assert_equal(Token.new('úøã', 72, 78), t.next)
+    assert_equal(Token.new('öîí', 80, 86), t.next)
+    assert(! t.next())
+  end
+end
+
+class AsciiWhiteSpaceTokenizerTest < Test::Unit::TestCase
+  include Ferret::Analysis
+
+  def test_whitespace_tokenizer()
+    input = 'DBalmain@gmail.com is My e-mail 52   #$ ADDRESS. 23#@$'
+    t = AsciiWhiteSpaceTokenizer.new(input)
+    assert_equal(Token.new('DBalmain@gmail.com', 0, 18), t.next)
+    assert_equal(Token.new('is', 19, 21), t.next)
+    assert_equal(Token.new('My', 22, 24), t.next)
+    assert_equal(Token.new('e-mail', 25, 31), t.next)
+    assert_equal(Token.new('52', 32, 34), t.next)
+    assert_equal(Token.new('#$', 37, 39), t.next)
+    assert_equal(Token.new('ADDRESS.', 40, 48), t.next)
+    assert_equal(Token.new('23#@$', 49, 54), t.next)
+    assert(! t.next())
+    t.text = "one_two three"
+    assert_equal(Token.new("one_two", 0, 7), t.next())
+    assert_equal(Token.new("three", 8, 13), t.next())
+    assert(! t.next())
+    t = AsciiLowerCaseFilter.new(AsciiWhiteSpaceTokenizer.new(input))
+    assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
+    assert_equal(Token.new('is', 19, 21), t.next)
+    assert_equal(Token.new('my', 22, 24), t.next)
+    assert_equal(Token.new('e-mail', 25, 31), t.next)
+    assert_equal(Token.new('52', 32, 34), t.next)
+    assert_equal(Token.new('#$', 37, 39), t.next)
+    assert_equal(Token.new('address.', 40, 48), t.next)
+    assert_equal(Token.new('23#@$', 49, 54), t.next)
+    assert(! t.next())
+  end
+end
+
+class WhiteSpaceTokenizerTest < Test::Unit::TestCase
+  include Ferret::Analysis
+
+  def test_whitespace_tokenizer()
+    input = 'DBalmän@gmail.com is My e-mail 52   #$ address. 23#@$ ÁÄGÇ®ÊË̯ÚØÃ¬ÖÎÍ'
+    t = WhiteSpaceTokenizer.new(input)
+    assert_equal(Token.new('DBalmän@gmail.com', 0, 18), t.next)
+    assert_equal(Token.new('is', 19, 21), t.next)
+    assert_equal(Token.new('My', 22, 24), t.next)
+    assert_equal(Token.new('e-mail', 25, 31), t.next)
+    assert_equal(Token.new('52', 32, 34), t.next)
+    assert_equal(Token.new('#$', 37, 39), t.next)
+    assert_equal(Token.new('address.', 40, 48), t.next)
+    assert_equal(Token.new('23#@$', 49, 54), t.next)
+    assert_equal(Token.new('ÁÄGÇ®ÊË̯ÚØÃ¬ÖÎÍ', 55, 86), t.next)
+    assert(! t.next())
+    t.text = "one_two three"
+    assert_equal(Token.new("one_two", 0, 7), t.next())
+    assert_equal(Token.new("three", 8, 13), t.next())
+    assert(! t.next())
+    t = LowerCaseFilter.new(WhiteSpaceTokenizer.new(input))
+    assert_equal(Token.new('dbalmän@gmail.com', 0, 18), t.next)
+    assert_equal(Token.new('is', 19, 21), t.next)
+    assert_equal(Token.new('my', 22, 24), t.next)
+    assert_equal(Token.new('e-mail', 25, 31), t.next)
+    assert_equal(Token.new('52', 32, 34), t.next)
+    assert_equal(Token.new('#$', 37, 39), t.next)
+    assert_equal(Token.new('address.', 40, 48), t.next)
+    assert_equal(Token.new('23#@$', 49, 54), t.next)
+    assert_equal(Token.new('áägç®êëì¯úøã¬öîí', 55, 86), t.next)
+    assert(! t.next())
+    t = WhiteSpaceTokenizer.new(input, true)
+    assert_equal(Token.new('dbalmän@gmail.com', 0, 18), t.next)
+    assert_equal(Token.new('is', 19, 21), t.next)
+    assert_equal(Token.new('my', 22, 24), t.next)
+    assert_equal(Token.new('e-mail', 25, 31), t.next)
+    assert_equal(Token.new('52', 32, 34), t.next)
+    assert_equal(Token.new('#$', 37, 39), t.next)
+    assert_equal(Token.new('address.', 40, 48), t.next)
+    assert_equal(Token.new('23#@$', 49, 54), t.next)
+    assert_equal(Token.new('áägç®êëì¯úøã¬öîí', 55, 86), t.next)
+    assert(! t.next())
+  end
+end
+
+class AsciiStandardTokenizerTest < Test::Unit::TestCase
+  include Ferret::Analysis
+
+  def test_standard_tokenizer()
+    input = 'DBalmain@gmail.com is My e-mail 52   #$ Address. 23#@$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234'
+    t = AsciiStandardTokenizer.new(input)
+    assert_equal(Token.new('DBalmain@gmail.com', 0, 18), t.next)
+    assert_equal(Token.new('is', 19, 21), t.next)
+    assert_equal(Token.new('My', 22, 24), t.next)
+    assert_equal(Token.new('e-mail', 25, 31), t.next)
+    assert_equal(Token.new('52', 32, 34), t.next)
+    assert_equal(Token.new('Address', 40, 47), t.next)
+    assert_equal(Token.new('23', 49, 51), t.next)
+    assert_equal(Token.new('www.google.com/results', 55, 84), t.next)
+    assert_equal(Token.new('TNT', 86, 91), t.next)
+    assert_equal(Token.new('123-1235-ASD-1234', 93, 110), t.next)
+    assert(! t.next())
+    t.text = "one_two three"
+    assert_equal(Token.new("one_two", 0, 7), t.next())
+    assert_equal(Token.new("three", 8, 13), t.next())
+    assert(! t.next())
+    t = AsciiLowerCaseFilter.new(AsciiStandardTokenizer.new(input))
+    assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
+    assert_equal(Token.new('is', 19, 21), t.next)
+    assert_equal(Token.new('my', 22, 24), t.next)
+    assert_equal(Token.new('e-mail', 25, 31), t.next)
+    assert_equal(Token.new('52', 32, 34), t.next)
+    assert_equal(Token.new('address', 40, 47), t.next)
+    assert_equal(Token.new('23', 49, 51), t.next)
+    assert_equal(Token.new('www.google.com/results', 55, 84), t.next)
+    assert_equal(Token.new('tnt', 86, 91), t.next)
+    assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
+    assert(! t.next())
+  end
+end
+
+class StandardTokenizerTest < Test::Unit::TestCase
+  include Ferret::Analysis
+
+  def test_standard_tokenizer()
+    input = 'DBalmán@gmail.com is My e-mail 52   #$ Address. 23#@$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234 23#@$ ÁÄGÇ®ÊË̯ÚØÃ¬ÖÎÍ'
+    t = StandardTokenizer.new(input)
+    assert_equal(Token.new('DBalmán@gmail.com', 0, 18), t.next)
+    assert_equal(Token.new('is', 19, 21), t.next)
+    assert_equal(Token.new('My', 22, 24), t.next)
+    assert_equal(Token.new('e-mail', 25, 31), t.next)
+    assert_equal(Token.new('52', 32, 34), t.next)
+    assert_equal(Token.new('Address', 40, 47), t.next)
+    assert_equal(Token.new('23', 49, 51), t.next)
+    assert_equal(Token.new('www.google.com/results', 55, 84), t.next)
+    assert_equal(Token.new('TNT', 86, 91), t.next)
+    assert_equal(Token.new('123-1235-ASD-1234', 93, 110), t.next)
+    assert_equal(Token.new('23', 111, 113), t.next)
+    assert_equal(Token.new('ÁÄGÇ', 117, 124), t.next)
+    assert_equal(Token.new('ÊËÌ', 126, 132), t.next)
+    assert_equal(Token.new('ÚØÃ', 134, 140), t.next)
+    assert_equal(Token.new('ÖÎÍ', 142, 148), t.next)
+    assert(! t.next())
+    t.text = "one_two three"
+    assert_equal(Token.new("one_two", 0, 7), t.next())
+    assert_equal(Token.new("three", 8, 13), t.next())
+    assert(! t.next())
+    t = LowerCaseFilter.new(StandardTokenizer.new(input))
+    assert_equal(Token.new('dbalmán@gmail.com', 0, 18), t.next)
+    assert_equal(Token.new('is', 19, 21), t.next)
+    assert_equal(Token.new('my', 22, 24), t.next)
+    assert_equal(Token.new('e-mail', 25, 31), t.next)
+    assert_equal(Token.new('52', 32, 34), t.next)
+    assert_equal(Token.new('address', 40, 47), t.next)
+    assert_equal(Token.new('23', 49, 51), t.next)
+    assert_equal(Token.new('www.google.com/results', 55, 84), t.next)
+    assert_equal(Token.new('tnt', 86, 91), t.next)
+    assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
+    assert_equal(Token.new('23', 111, 113), t.next)
+    assert_equal(Token.new('áägç', 117, 124), t.next)
+    assert_equal(Token.new('êëì', 126, 132), t.next)
+    assert_equal(Token.new('úøã', 134, 140), t.next)
+    assert_equal(Token.new('öîí', 142, 148), t.next)
+    assert(! t.next())
+  end
+end
+
+class StopFilterTest < Test::Unit::TestCase
+  include Ferret::Analysis
+
+  def test_stop_filter()
+    words = ["one", "four", "five", "seven"]
+    input = "one, two, three, four, five, six, seven, eight, nine, ten."
+    t = StopFilter.new(AsciiLetterTokenizer.new(input), words)
+    assert_equal(Token.new('two', 5, 8, 2), t.next)
+    assert_equal(Token.new('three', 10, 15, 1), t.next)
+    assert_equal(Token.new('six', 29, 32, 3), t.next)
+    assert_equal(Token.new('eight', 41, 46, 2), t.next)
+    assert_equal(Token.new('nine', 48, 52, 1), t.next)
+    assert_equal(Token.new('ten', 54, 57, 1), t.next)
+    assert(! t.next())
+  end
+end
+
+class StemFilterTest < Test::Unit::TestCase
+  include Ferret::Analysis
+
+  def test_stem_filter()
+    input = "Debate Debates DEBATED DEBating Debater"
+    t = StemFilter.new(AsciiLowerCaseFilter.new(AsciiLetterTokenizer.new(input)),
+                       "english")
+    assert_equal(Token.new("debat", 0, 6), t.next)
+    assert_equal(Token.new("debat", 7, 14), t.next)
+    assert_equal(Token.new("debat", 15, 22), t.next)
+    assert_equal(Token.new("debat", 23, 31), t.next)
+    assert_equal(Token.new("debat", 32, 39), t.next)
+    assert(! t.next())
+    t = StemFilter.new(AsciiLetterTokenizer.new(input), :english)
+    assert_equal(Token.new("Debat", 0, 6), t.next)
+    assert_equal(Token.new("Debat", 7, 14), t.next)
+    assert_equal(Token.new("DEBATED", 15, 22), t.next)
+    assert_equal(Token.new("DEBate", 23, 31), t.next)
+    assert_equal(Token.new("Debat", 32, 39), t.next)
+
+    input = "Dêbate dêbates DÊBATED DÊBATing dêbater"
+    t = StemFilter.new(LowerCaseFilter.new(LetterTokenizer.new(input)), :english)
+    assert_equal(Token.new("dêbate", 0, 7), t.next)
+    assert_equal(Token.new("dêbate", 8, 16), t.next)
+    assert_equal(Token.new("dêbate", 17, 25), t.next)
+    assert_equal(Token.new("dêbate", 26, 35), t.next)
+    assert_equal(Token.new("dêbater", 36, 44), t.next)
+    t = StemFilter.new(LetterTokenizer.new(input), :english)
+    assert_equal(Token.new("Dêbate", 0, 7), t.next)
+    assert_equal(Token.new("dêbate", 8, 16), t.next)
+    assert_equal(Token.new("DÊBATED", 17, 25), t.next)
+    assert_equal(Token.new("DÊBATing", 26, 35), t.next)
+    assert_equal(Token.new("dêbater", 36, 44), t.next)
+    assert(! t.next())
+  end
+end
+
+require 'strscan'
+module Ferret::Analysis
+
+  class MyRegExpTokenizer < TokenStream
+
+    def initialize(input)
+      @ss = StringScanner.new(input)
+    end
+
+    # Returns the next token in the stream, or nil at end-of-stream.
+    def next()
+      if @ss.scan_until(token_re)
+        term = @ss.matched
+        term_end = @ss.pos
+        term_start = term_end - term.size
+      else
+        return nil
+      end
+
+      return Token.new(normalize(term), term_start, term_end)
+    end
+
+    protected
+      # Returns the regular expression used to find the next token.
+      TOKEN_RE = /[[:alpha:]]+/
+      def token_re
+        TOKEN_RE
+      end
+
+      # Called on each token to normalize it before it is added to the
+      # token stream. The default implementation does nothing. Subclasses
+      # may use this to, e.g., lowercase tokens.
+      def normalize(str) return str end
+  end
+
+  class MyCSVTokenizer < MyRegExpTokenizer
+    protected
+      # Returns the regular expression used to find the next token.
+      TOKEN_RE = /[^,]+/
+      def token_re
+        TOKEN_RE
+      end
+
+      # Called on each token to normalize it before it is added to the
+      # token stream. The default implementation does nothing. Subclasses
+      # may use this to, e.g., lowercase tokens.
+      def normalize(str) return str.upcase end
+  end
+end
+
+class CustomTokenizerTest < Test::Unit::TestCase
+  include Ferret::Analysis
+
+  def test_custom_tokenizer()
+    input = "First Field,2nd Field,  P a d d e d  F i e l d  "
+    t = MyCSVTokenizer.new(input)
+    assert_equal(Token.new("FIRST FIELD", 0, 11), t.next)
+    assert_equal(Token.new("2ND FIELD", 12, 21), t.next)
+    assert_equal(Token.new("  P A D D E D  F I E L D  ", 22, 48), t.next)
+    assert(! t.next())
+    t = AsciiLowerCaseFilter.new(MyCSVTokenizer.new(input))
+    assert_equal(Token.new("first field", 0, 11), t.next)
+    assert_equal(Token.new("2nd field", 12, 21), t.next)
+    assert_equal(Token.new("  p a d d e d  f i e l d  ", 22, 48), t.next)
+    assert(! t.next())
+  end
+end
+
+module Ferret::Analysis
+  class TokenFilter < TokenStream
+    protected
+      # Construct a token stream filtering the given input.
+      def initialize(input)
+        @input = input
+      end
+  end
+
+  # Capitalizes the first letter of each token's text.
+  class CapitalizeFilter < TokenFilter
+    def next()
+      t = @input.next()
+
+      if (t == nil)
+        return nil
+      end
+
+      t.text = t.text[0,1].upcase + t.text[1..-1]
+
+      return t
+    end
+  end
+end
+
+class CustomFilterTest < Test::Unit::TestCase
+  include Ferret::Analysis
+
+  def test_custom_filter()
+    input = "This text SHOULD be capitalized ... I hope. :-S"
+    t = CapitalizeFilter.new(AsciiLetterTokenizer.new(input))
+    assert_equal(Token.new("This", 0, 4), t.next)
+    assert_equal(Token.new("Text", 5, 9), t.next)
+    assert_equal(Token.new("SHOULD", 10, 16), t.next)
+    assert_equal(Token.new("Be", 17, 19), t.next)
+    assert_equal(Token.new("Capitalized", 20, 31), t.next)
+    assert_equal(Token.new("I", 36, 37), t.next)
+    assert_equal(Token.new("Hope", 38, 42), t.next)
+    assert_equal(Token.new("S", 46, 47), t.next)
+    assert(! t.next())
+    t = StemFilter.new(CapitalizeFilter.new(AsciiLetterTokenizer.new(input)))
+    assert_equal(Token.new("This", 0, 4), t.next)
+    assert_equal(Token.new("Text", 5, 9), t.next)
+    assert_equal(Token.new("SHOULD", 10, 16), t.next)
+    assert_equal(Token.new("Be", 17, 19), t.next)
+    assert_equal(Token.new("Capit", 20, 31), t.next)
+    assert_equal(Token.new("I", 36, 37), t.next)
+    assert_equal(Token.new("Hope", 38, 42), t.next)
+    assert_equal(Token.new("S", 46, 47), t.next)
+    assert(! t.next())
+  end
+end
data/test/unit/analysis/tc_analyzer.rb
@@ -2,10 +2,9 @@ require File.dirname(__FILE__) + "/../../test_helper"
 
 class AnalyzerTest < Test::Unit::TestCase
   include Ferret::Analysis
-  include Ferret::Utils::StringHelper
 
   def test_analyzer()
-    input = StringReader.new('DBalmain@gmail.com is My E-Mail 523@#$ ADDRESS. 23#@$')
+    input = 'DBalmain@gmail.com is My E-Mail 523@#$ ADDRESS. 23#@$'
     a = Analyzer.new()
     t = a.token_stream("fieldname", input)
     assert_equal(Token.new("dbalmain", 0, 8), t.next())