ferret 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202):
  1. data/MIT-LICENSE +20 -0
  2. data/README +109 -0
  3. data/Rakefile +275 -0
  4. data/TODO +9 -0
  5. data/TUTORIAL +197 -0
  6. data/ext/extconf.rb +3 -0
  7. data/ext/ferret.c +23 -0
  8. data/ext/ferret.h +85 -0
  9. data/ext/index_io.c +543 -0
  10. data/ext/priority_queue.c +227 -0
  11. data/ext/ram_directory.c +316 -0
  12. data/ext/segment_merge_queue.c +41 -0
  13. data/ext/string_helper.c +42 -0
  14. data/ext/tags +240 -0
  15. data/ext/term.c +261 -0
  16. data/ext/term_buffer.c +299 -0
  17. data/ext/util.c +12 -0
  18. data/lib/ferret.rb +41 -0
  19. data/lib/ferret/analysis.rb +11 -0
  20. data/lib/ferret/analysis/analyzers.rb +93 -0
  21. data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
  22. data/lib/ferret/analysis/token.rb +79 -0
  23. data/lib/ferret/analysis/token_filters.rb +86 -0
  24. data/lib/ferret/analysis/token_stream.rb +26 -0
  25. data/lib/ferret/analysis/tokenizers.rb +107 -0
  26. data/lib/ferret/analysis/word_list_loader.rb +27 -0
  27. data/lib/ferret/document.rb +2 -0
  28. data/lib/ferret/document/document.rb +152 -0
  29. data/lib/ferret/document/field.rb +304 -0
  30. data/lib/ferret/index.rb +26 -0
  31. data/lib/ferret/index/compound_file_io.rb +343 -0
  32. data/lib/ferret/index/document_writer.rb +288 -0
  33. data/lib/ferret/index/field_infos.rb +259 -0
  34. data/lib/ferret/index/fields_io.rb +175 -0
  35. data/lib/ferret/index/index.rb +228 -0
  36. data/lib/ferret/index/index_file_names.rb +33 -0
  37. data/lib/ferret/index/index_reader.rb +462 -0
  38. data/lib/ferret/index/index_writer.rb +488 -0
  39. data/lib/ferret/index/multi_reader.rb +363 -0
  40. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
  41. data/lib/ferret/index/segment_infos.rb +130 -0
  42. data/lib/ferret/index/segment_merge_info.rb +47 -0
  43. data/lib/ferret/index/segment_merge_queue.rb +16 -0
  44. data/lib/ferret/index/segment_merger.rb +337 -0
  45. data/lib/ferret/index/segment_reader.rb +380 -0
  46. data/lib/ferret/index/segment_term_enum.rb +178 -0
  47. data/lib/ferret/index/segment_term_vector.rb +58 -0
  48. data/lib/ferret/index/term.rb +49 -0
  49. data/lib/ferret/index/term_buffer.rb +88 -0
  50. data/lib/ferret/index/term_doc_enum.rb +283 -0
  51. data/lib/ferret/index/term_enum.rb +52 -0
  52. data/lib/ferret/index/term_info.rb +41 -0
  53. data/lib/ferret/index/term_infos_io.rb +312 -0
  54. data/lib/ferret/index/term_vector_offset_info.rb +20 -0
  55. data/lib/ferret/index/term_vectors_io.rb +552 -0
  56. data/lib/ferret/query_parser.rb +274 -0
  57. data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
  58. data/lib/ferret/search.rb +49 -0
  59. data/lib/ferret/search/boolean_clause.rb +100 -0
  60. data/lib/ferret/search/boolean_query.rb +303 -0
  61. data/lib/ferret/search/boolean_scorer.rb +294 -0
  62. data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
  63. data/lib/ferret/search/conjunction_scorer.rb +99 -0
  64. data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
  65. data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
  66. data/lib/ferret/search/explanation.rb +41 -0
  67. data/lib/ferret/search/field_cache.rb +216 -0
  68. data/lib/ferret/search/field_doc.rb +31 -0
  69. data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
  70. data/lib/ferret/search/filter.rb +11 -0
  71. data/lib/ferret/search/filtered_query.rb +130 -0
  72. data/lib/ferret/search/filtered_term_enum.rb +79 -0
  73. data/lib/ferret/search/fuzzy_query.rb +153 -0
  74. data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
  75. data/lib/ferret/search/hit_collector.rb +34 -0
  76. data/lib/ferret/search/hit_queue.rb +11 -0
  77. data/lib/ferret/search/index_searcher.rb +173 -0
  78. data/lib/ferret/search/match_all_docs_query.rb +104 -0
  79. data/lib/ferret/search/multi_phrase_query.rb +204 -0
  80. data/lib/ferret/search/multi_term_query.rb +65 -0
  81. data/lib/ferret/search/non_matching_scorer.rb +22 -0
  82. data/lib/ferret/search/phrase_positions.rb +55 -0
  83. data/lib/ferret/search/phrase_query.rb +217 -0
  84. data/lib/ferret/search/phrase_scorer.rb +153 -0
  85. data/lib/ferret/search/prefix_query.rb +47 -0
  86. data/lib/ferret/search/query.rb +111 -0
  87. data/lib/ferret/search/query_filter.rb +51 -0
  88. data/lib/ferret/search/range_filter.rb +103 -0
  89. data/lib/ferret/search/range_query.rb +139 -0
  90. data/lib/ferret/search/req_excl_scorer.rb +125 -0
  91. data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
  92. data/lib/ferret/search/score_doc.rb +38 -0
  93. data/lib/ferret/search/score_doc_comparator.rb +114 -0
  94. data/lib/ferret/search/scorer.rb +91 -0
  95. data/lib/ferret/search/similarity.rb +278 -0
  96. data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
  97. data/lib/ferret/search/sort.rb +105 -0
  98. data/lib/ferret/search/sort_comparator.rb +60 -0
  99. data/lib/ferret/search/sort_field.rb +87 -0
  100. data/lib/ferret/search/spans.rb +12 -0
  101. data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
  102. data/lib/ferret/search/spans/span_first_query.rb +79 -0
  103. data/lib/ferret/search/spans/span_near_query.rb +108 -0
  104. data/lib/ferret/search/spans/span_not_query.rb +130 -0
  105. data/lib/ferret/search/spans/span_or_query.rb +176 -0
  106. data/lib/ferret/search/spans/span_query.rb +25 -0
  107. data/lib/ferret/search/spans/span_scorer.rb +74 -0
  108. data/lib/ferret/search/spans/span_term_query.rb +105 -0
  109. data/lib/ferret/search/spans/span_weight.rb +84 -0
  110. data/lib/ferret/search/spans/spans_enum.rb +44 -0
  111. data/lib/ferret/search/term_query.rb +128 -0
  112. data/lib/ferret/search/term_scorer.rb +181 -0
  113. data/lib/ferret/search/top_docs.rb +24 -0
  114. data/lib/ferret/search/top_field_docs.rb +17 -0
  115. data/lib/ferret/search/weight.rb +54 -0
  116. data/lib/ferret/search/wildcard_query.rb +26 -0
  117. data/lib/ferret/search/wildcard_term_enum.rb +61 -0
  118. data/lib/ferret/stemmers.rb +1 -0
  119. data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
  120. data/lib/ferret/store.rb +5 -0
  121. data/lib/ferret/store/buffered_index_io.rb +191 -0
  122. data/lib/ferret/store/directory.rb +139 -0
  123. data/lib/ferret/store/fs_store.rb +338 -0
  124. data/lib/ferret/store/index_io.rb +259 -0
  125. data/lib/ferret/store/ram_store.rb +282 -0
  126. data/lib/ferret/utils.rb +7 -0
  127. data/lib/ferret/utils/bit_vector.rb +105 -0
  128. data/lib/ferret/utils/date_tools.rb +138 -0
  129. data/lib/ferret/utils/number_tools.rb +91 -0
  130. data/lib/ferret/utils/parameter.rb +41 -0
  131. data/lib/ferret/utils/priority_queue.rb +120 -0
  132. data/lib/ferret/utils/string_helper.rb +47 -0
  133. data/lib/ferret/utils/weak_key_hash.rb +51 -0
  134. data/rake_utils/code_statistics.rb +106 -0
  135. data/setup.rb +1551 -0
  136. data/test/benchmark/tb_ram_store.rb +76 -0
  137. data/test/benchmark/tb_rw_vint.rb +26 -0
  138. data/test/longrunning/tc_numbertools.rb +60 -0
  139. data/test/longrunning/tm_store.rb +19 -0
  140. data/test/test_all.rb +9 -0
  141. data/test/test_helper.rb +6 -0
  142. data/test/unit/analysis/tc_analyzer.rb +21 -0
  143. data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
  144. data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
  145. data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
  146. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
  147. data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
  148. data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
  149. data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
  150. data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
  151. data/test/unit/analysis/tc_stop_filter.rb +14 -0
  152. data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
  153. data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
  154. data/test/unit/analysis/tc_word_list_loader.rb +32 -0
  155. data/test/unit/document/tc_document.rb +47 -0
  156. data/test/unit/document/tc_field.rb +80 -0
  157. data/test/unit/index/tc_compound_file_io.rb +107 -0
  158. data/test/unit/index/tc_field_infos.rb +119 -0
  159. data/test/unit/index/tc_fields_io.rb +167 -0
  160. data/test/unit/index/tc_index.rb +140 -0
  161. data/test/unit/index/tc_index_reader.rb +622 -0
  162. data/test/unit/index/tc_index_writer.rb +57 -0
  163. data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
  164. data/test/unit/index/tc_segment_infos.rb +74 -0
  165. data/test/unit/index/tc_segment_term_docs.rb +17 -0
  166. data/test/unit/index/tc_segment_term_enum.rb +60 -0
  167. data/test/unit/index/tc_segment_term_vector.rb +71 -0
  168. data/test/unit/index/tc_term.rb +22 -0
  169. data/test/unit/index/tc_term_buffer.rb +57 -0
  170. data/test/unit/index/tc_term_info.rb +19 -0
  171. data/test/unit/index/tc_term_infos_io.rb +192 -0
  172. data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
  173. data/test/unit/index/tc_term_vectors_io.rb +108 -0
  174. data/test/unit/index/th_doc.rb +244 -0
  175. data/test/unit/query_parser/tc_query_parser.rb +84 -0
  176. data/test/unit/search/tc_filter.rb +113 -0
  177. data/test/unit/search/tc_fuzzy_query.rb +136 -0
  178. data/test/unit/search/tc_index_searcher.rb +188 -0
  179. data/test/unit/search/tc_search_and_sort.rb +98 -0
  180. data/test/unit/search/tc_similarity.rb +37 -0
  181. data/test/unit/search/tc_sort.rb +48 -0
  182. data/test/unit/search/tc_sort_field.rb +27 -0
  183. data/test/unit/search/tc_spans.rb +153 -0
  184. data/test/unit/store/tc_fs_store.rb +84 -0
  185. data/test/unit/store/tc_ram_store.rb +35 -0
  186. data/test/unit/store/tm_store.rb +180 -0
  187. data/test/unit/store/tm_store_lock.rb +68 -0
  188. data/test/unit/ts_analysis.rb +16 -0
  189. data/test/unit/ts_document.rb +4 -0
  190. data/test/unit/ts_index.rb +18 -0
  191. data/test/unit/ts_query_parser.rb +3 -0
  192. data/test/unit/ts_search.rb +10 -0
  193. data/test/unit/ts_store.rb +6 -0
  194. data/test/unit/ts_utils.rb +10 -0
  195. data/test/unit/utils/tc_bit_vector.rb +65 -0
  196. data/test/unit/utils/tc_date_tools.rb +50 -0
  197. data/test/unit/utils/tc_number_tools.rb +59 -0
  198. data/test/unit/utils/tc_parameter.rb +40 -0
  199. data/test/unit/utils/tc_priority_queue.rb +62 -0
  200. data/test/unit/utils/tc_string_helper.rb +21 -0
  201. data/test/unit/utils/tc_weak_key_hash.rb +25 -0
  202. metadata +251 -0
@@ -0,0 +1,76 @@
1
+ require File.dirname(__FILE__) + "/../test_helper"
2
+ require 'benchmark'
3
+
4
+ class RAMStoreTest < Test::Unit::TestCase
5
+ def setup
6
+ @dir = Ferret::Store::RAMDirectory.new
7
+ end
8
+
9
+ def teardown
10
+ @dir.close()
11
+ end
12
+
13
+ def test_rw_bytes
14
+ bytes = [0x34, 0x87, 0xF9, 0xEA, 0x00, 0xFF]
15
+ rw_test(bytes, "byte")
16
+ end
17
+
18
+ def test_rw_ints
19
+ ints = [-2147483648, 2147483647, -1, 0]
20
+ rw_test(ints, "int")
21
+ end
22
+
23
+ def test_rw_longs
24
+ longs = [-9223372036854775808, 9223372036854775807, -1, 0]
25
+ rw_test(longs, "long")
26
+ end
27
+
28
+ def test_rw_uints
29
+ uints = [0xffffffff, 100000, 0]
30
+ rw_test(uints, "uint")
31
+ end
32
+
33
+ def test_rw_ulongs
34
+ ulongs = [0xffffffffffffffff, 100000000000000, 0]
35
+ rw_test(ulongs, "ulong")
36
+ end
37
+
38
+ def test_rw_vints
39
+ vints = [ 0xF8DC843342FE3484234987FE98AB987C897D214D123D123458EFBE2E238BACDEB9878790ABCDEF123DEF23988B89C,
40
+ 0x0000000000000000000000000000000000000000,
41
+ 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF]
42
+ rw_test(vints, "vint")
43
+ end
44
+
45
+ def test_rw_vlongs
46
+ vlongs = [ 0xF8DC843342FE3484234987FE98AB987C897D214D123D123458EFBE2E238BACDEB9878790ABCDEF123DEF23988B89C,
47
+ 0x0000000000000000000000000000000000000000,
48
+ 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF]
49
+ rw_test(vlongs, "vlong")
50
+ end
51
+
52
+ def test_rw_strings
53
+ strings = ['This is a ruby ferret test string ~!@#$%^&*()`123456790-=\)_+|', 'This is another string. I\'ll make this one a little longer than the last one. But I guess we need a few shorter ones too.', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten']
54
+ rw_test(strings, "string")
55
+ end
56
+
57
+ # this test fills up the output stream so that the buffer will have to be
58
+ # written a few times. It then uses seek to make sure that it works
59
+ # correctly
60
+
61
+ def rw_test(values, type)
62
+ puts "\nrw_#{type} test"
63
+ Benchmark.bmbm do |x|
64
+ x.report("write") do
65
+ ostream = @dir.create_output("rw_#{type}.test")
66
+ 1000.times {values.each { |b| ostream.__send__("write_" + type, b) }}
67
+ ostream.close
68
+ end
69
+ x.report("read") do
70
+ istream = @dir.open_input("rw_#{type}.test")
71
+ 1000.times {values.each { |b| assert_equal(b, istream.__send__("read_" + type), "#{type} should be equal") }}
72
+ istream.close
73
+ end
74
+ end
75
+ end
76
+ end
@@ -0,0 +1,26 @@
$:.unshift File.join(File.dirname(__FILE__), '../../lib')

require 'ferret'

# Rough benchmark: time how long it takes to write and read back a stream of
# variable-length ints through a filesystem directory, 10 runs deep.
sample_vints = [ 9223372036854775807,
                 0x00,
                 0xFFFFFFFFFFFFFFFF]
started_at = Time.new
10.times do
  dir_path = File.join(File.dirname(__FILE__),
                       'fsdir')
  dir = Ferret::Store::FSDirectory.get_directory(dir_path, true)

  100.times do
    out = dir.create_output("rw_vint.test")
    # Cycle through the three sample values 300 times.
    300.times { |i| out.write_vint(sample_vints[i % 3]) }
    out.close
    inp = dir.open_input("rw_vint.test")
    300.times { inp.read_vint }
    inp.close
  end

  dir.close
end

puts "took #{Time.new - started_at} seconds"
@@ -0,0 +1,60 @@
require File.dirname(__FILE__) + "/../../../test_helper"


# Tests that NumberTools converts longs to fixed-width, lexicographically
# ordered strings and back again without loss, across zero, MAX and MIN.
class NumberToolsTest < Test::Unit::TestCase
  # FIXED: was `include Lucene::Document` — a leftover namespace from the
  # Java Lucene port. NumberTools lives under Ferret::Utils in this gem
  # (lib/ferret/utils/number_tools.rb), so Lucene::Document would raise
  # NameError here.
  include Ferret::Utils

  def test_near_zero()
    10.times() do |i|
      10.times() { |j| subtest_two_longs(i, j) }
    end
  end

  def test_max()
    # make sure the constants convert to their equivalents
    assert_equal(NumberTools::LONG_MAX_VALUE, NumberTools.s_to_long(NumberTools::MAX_STRING_VALUE))
    assert_equal(NumberTools::MAX_STRING_VALUE, NumberTools.long_to_s(NumberTools::LONG_MAX_VALUE))
    # test near MAX, too

    NumberTools::LONG_MAX_VALUE.downto(NumberTools::LONG_MAX_VALUE - 100) do |l|
      subtest_two_longs(l, l - 1)
    end
  end

  def test_min()
    # make sure the constants convert to their equivalents
    assert_equal(NumberTools::LONG_MIN_VALUE, NumberTools.s_to_long(NumberTools::MIN_STRING_VALUE))
    assert_equal(NumberTools::MIN_STRING_VALUE, NumberTools.long_to_s(NumberTools::LONG_MIN_VALUE))

    # test near MIN, too
    NumberTools::LONG_MIN_VALUE.upto(NumberTools::LONG_MIN_VALUE + 100) do |l|
      subtest_two_longs(l, l + 1)
    end
  end

  # Checks the full contract for a pair of longs: fixed string width, string
  # ordering matches numeric ordering, and lossless round-trip conversion.
  def subtest_two_longs(i, j)
    # convert to strings
    a = NumberTools.long_to_s(i)
    b = NumberTools.long_to_s(j)

    # are they the right length?
    assert_equal(NumberTools::STR_SIZE, a.length())
    assert_equal(NumberTools::STR_SIZE, b.length())

    # are they the right order?
    if (i < j)
      assert(a < b)
    elsif (i > j)
      assert(a > b)
    else
      assert_equal(a, b)
    end

    # can we convert them back to longs?
    i2 = NumberTools.s_to_long(a)
    j2 = NumberTools.s_to_long(b)

    assert_equal(i, i2)
    assert_equal(j, j2)
  end

end
@@ -0,0 +1,19 @@
# Shared examples mixed into directory-implementation test cases; the
# including test is expected to provide @dir in its setup.
module StoreTest
  # declare dir so inheritors can access it.
  def test_modified_full
    # File mtimes are only stored to the nearest second, which makes this
    # awkward to test precisely; we assume each step here completes well
    # within one second.
    time = Time.new.to_i
    @dir.touch('mtime_test')
    time_before = @dir.modified('mtime_test').to_i
    assert(time_before - time <= 2, "test that mtime is approximately equal to the system time when the file was touched")
    # spin until the wall clock ticks over to the next second
    time = Time.new while (time.to_i == time_before)
    time_before_again = @dir.modified('mtime_test').to_i
    assert_equal(time_before, time_before_again, "the modified time shouldn't change")
    @dir.touch('mtime_test')
    time_after = @dir.modified('mtime_test').to_i
    assert(time_before < time_after, "the modified time should now be greater")
  end
end
@@ -0,0 +1,9 @@
$:.unshift File.dirname(__FILE__)

# Pull in every unit-test suite, in the same order as before.
%w[analysis document index query_parser search store utils].each do |suite|
  require "unit/ts_#{suite}.rb"
end
@@ -0,0 +1,6 @@
# Put the library and the compiled extension on the load path, then load the
# test framework, ferret itself, and the shared document fixture helper.
['../lib', '../ext'].each do |rel|
  $:.unshift File.join(File.dirname(__FILE__), rel)
end

require 'test/unit'
require 'ferret'
require 'test/unit/index/th_doc'
@@ -0,0 +1,21 @@
require File.dirname(__FILE__) + "/../../test_helper"

# The default Analyzer lower-cases and splits on non-letters, so the e-mail
# address and numbers below break apart and the digit runs disappear.
class AnalyzerTest < Test::Unit::TestCase
  include Ferret::Analysis
  include Ferret::Utils::StringHelper

  def test_analyzer()
    input = StringReader.new('DBalmain@gmail.com is My E-Mail 523@#$ ADDRESS. 23#@$')
    a = Analyzer.new()
    t = a.token_stream("fieldname", input)
    expected = [
      ["dbalmain", 0, 8], ["gmail", 9, 14], ["com", 15, 18],
      ["is", 19, 21], ["my", 22, 24], ["e", 25, 26],
      ["mail", 27, 31], ["address", 39, 46]
    ]
    expected.each do |text, from, to|
      assert_equal(Token.new(text, from, to), t.next())
    end
    assert(! t.next())
  end
end
@@ -0,0 +1,20 @@
require File.dirname(__FILE__) + "/../../test_helper"

# LetterTokenizer splits on every non-letter character but, unlike the
# lower-case variant, preserves the original case of each token.
class LetterTokenizerTest < Test::Unit::TestCase
  include Ferret::Analysis
  include Ferret::Utils::StringHelper

  def test_lettertokenizer()
    input = StringReader.new('DBalmain@gmail.com is My e-mail 523@#$ address. 23#@$')
    t = LetterTokenizer.new(input)
    expected = [
      ["DBalmain", 0, 8], ["gmail", 9, 14], ["com", 15, 18],
      ["is", 19, 21], ["My", 22, 24], ["e", 25, 26],
      ["mail", 27, 31], ["address", 39, 46]
    ]
    expected.each do |text, from, to|
      assert_equal(Token.new(text, from, to), t.next())
    end
    assert(! t.next())
  end
end
@@ -0,0 +1,20 @@
require File.dirname(__FILE__) + "/../../test_helper"

# LowerCaseFilter only changes case; the whitespace tokenization (and hence
# all offsets) of the wrapped tokenizer must pass through untouched.
class LowerCaseFilterTest < Test::Unit::TestCase
  include Ferret::Analysis
  include Ferret::Utils::StringHelper

  def test_lowercasefilter()
    input = StringReader.new('DBalmain@gmail.com is My E-Mail 52 #$ ADDRESS. 23#@$')
    t = LowerCaseFilter.new(WhiteSpaceTokenizer.new(input))
    expected = [
      ['dbalmain@gmail.com', 0, 18], ['is', 19, 21], ['my', 22, 24],
      ['e-mail', 25, 31], ['52', 32, 34], ['#$', 37, 39],
      ['address.', 40, 48], ['23#@$', 49, 54]
    ]
    expected.each do |text, from, to|
      assert_equal(Token.new(text, from, to), t.next())
    end
    assert(! t.next())
  end
end
@@ -0,0 +1,27 @@
require File.dirname(__FILE__) + "/../../test_helper"

class LowerCaseTokenizerTest < Test::Unit::TestCase
  include Ferret::Analysis
  include Ferret::Utils::StringHelper

  # normalize is a protected single-character hook, hence the __send__;
  # letters are down-cased, everything else is passed through.
  def test_normalize()
    lt = LowerCaseTokenizer.new(StringReader.new(""))
    { "!" => '!', "r" => 'r', "R" => 'r' }.each do |raw, normalized|
      assert_equal(normalized, lt.__send__(:normalize, raw))
    end
  end

  # Splits on non-letters and down-cases, so offsets match LetterTokenizer
  # while the token text is all lower case.
  def test_lowercase_tokenizer()
    input = StringReader.new('DBalmain@gmail.com is My E-Mail 523@#$ ADDRESS. 23#@$')
    t = LowerCaseTokenizer.new(input)
    expected = [
      ["dbalmain", 0, 8], ["gmail", 9, 14], ["com", 15, 18],
      ["is", 19, 21], ["my", 22, 24], ["e", 25, 26],
      ["mail", 27, 31], ["address", 39, 46]
    ]
    expected.each do |text, from, to|
      assert_equal(Token.new(text, from, to), t.next())
    end
    assert(! t.next())
  end
end
@@ -0,0 +1,39 @@
require File.dirname(__FILE__) + "/../../test_helper"

# Checks that PerFieldAnalyzerWrapper routes each field to its registered
# analyzer ("abstract" -> whitespace, "body" -> stop words) and falls back to
# the default Analyzer for unregistered fields ("title").
class PerFieldAnalyzerWrapperTest < Test::Unit::TestCase
  include Ferret::Analysis
  include Ferret::Utils::StringHelper
  def test_perfieldanalyzerwrapper()
    aw = PerFieldAnalyzerWrapper.new(Analyzer.new())
    aw.add_analyzer("abstract", WhiteSpaceAnalyzer.new())
    aw.add_analyzer("body", StopAnalyzer.new(['is', 'my', 'address']))
    input = StringReader.new('DBalmain@gmail.com is My e-mail ADDRESS')
    # unregistered field: default (lower-casing, letter-splitting) analyzer
    t = aw.token_stream("title", input)
    assert_equal(Token.new("dbalmain", 0, 8), t.next())
    assert_equal(Token.new("gmail", 9, 14), t.next())
    assert_equal(Token.new("com", 15, 18), t.next())
    assert_equal(Token.new("is", 19, 21), t.next())
    assert_equal(Token.new("my", 22, 24), t.next())
    assert_equal(Token.new("e", 25, 26), t.next())
    assert_equal(Token.new("mail", 27, 31), t.next())
    assert_equal(Token.new("address", 32, 39), t.next())
    assert(! t.next())
    input.reset()
    # registered whitespace analyzer: case and punctuation preserved
    t = aw.token_stream("abstract", input)
    assert_equal(Token.new('DBalmain@gmail.com', 0, 18), t.next())
    assert_equal(Token.new('is', 19, 21), t.next())
    assert_equal(Token.new('My', 22, 24), t.next())
    assert_equal(Token.new('e-mail', 25, 31), t.next())
    assert_equal(Token.new("ADDRESS", 32, 39), t.next())
    # FIXED: this line used the Ruby 1.8 `if expr: ... end` colon form, which
    # is a syntax error from Ruby 1.9 on. NOTE(review): it looks like leftover
    # debugging — it consumes a token before the exhaustion assert below, so
    # confirm intent before deleting it outright.
    puts token.term_text if (token = t.next())
    assert(! t.next())
    input.reset()
    # registered stop analyzer: 'is', 'my' and 'address' are filtered out
    t = aw.token_stream("body", input)
    assert_equal(Token.new("dbalmain", 0, 8), t.next())
    assert_equal(Token.new("gmail", 9, 14), t.next())
    assert_equal(Token.new("com", 15, 18), t.next())
    assert_equal(Token.new("e", 25, 26), t.next())
    assert_equal(Token.new("mail", 27, 31), t.next())
    assert(! t.next())
  end
end
@@ -0,0 +1,16 @@
require File.dirname(__FILE__) + "/../../test_helper"

# All four inflections of "breathe" should stem to the same root while the
# tokens keep their original source offsets.
class PorterStemFilterTest < Test::Unit::TestCase
  include Ferret::Analysis
  include Ferret::Utils::StringHelper

  def test_porterstempfilter()
    input = StringReader.new('breath Breathes BreatHed BREATHING')
    t = PorterStemFilter.new(LowerCaseFilter.new(WhiteSpaceTokenizer.new(input)))
    [[0, 6], [7, 15], [16, 24], [25, 34]].each do |from, to|
      assert_equal(Token.new('breath', from, to), t.next())
    end
    assert(! t.next())
  end
end
@@ -0,0 +1,20 @@
require File.dirname(__FILE__) + "/../../test_helper"

# StandardAnalyzer keeps e-mail addresses, company names and apostrophe
# forms intact, collapses acronyms (I.B.M. -> ibm), and drops stop words
# and pure punctuation.
class StandardAnalyzerTest < Test::Unit::TestCase
  include Ferret::Utils::StringHelper
  include Ferret::Analysis

  def test_lettertokenizer()
    input = StringReader.new('D.Ba_l-n@gma-l.com AB&Sons Toys\'r\'us you\'re she\'s, #$%^$%*& job@dot I.B.M. the an AnD THEIR')
    sa = StandardAnalyzer.new()
    t = sa.token_stream("field", input)
    expected = [
      ["d.ba_l-n@gma-l.com", 0, 18], ["ab&sons", 19, 26],
      ["toys'r'us", 27, 36], ["you're", 37, 43], ["she", 44, 49],
      ["job@dot", 60, 67], ["ibm", 68, 74]
    ]
    expected.each do |text, from, to|
      assert_equal(Token.new(text, from, to), t.next())
    end
    assert(! t.next())
  end
end
@@ -0,0 +1,20 @@
require File.dirname(__FILE__) + "/../../test_helper"

# StandardTokenizer preserves case and whole e-mail addresses, keeps digit
# runs as tokens, and strips trailing punctuation/symbols.
class StandardTokenizerTest < Test::Unit::TestCase
  include Ferret::Analysis
  include Ferret::Utils::StringHelper

  def test_lettertokenizer()
    input = StringReader.new('DBalmain@gmail.com is My e-mail 523@#$ address. 23#@$')
    t = StandardTokenizer.new(input)
    expected = [
      ["DBalmain@gmail.com", 0, 18], ["is", 19, 21], ["My", 22, 24],
      ["e", 25, 26], ["mail", 27, 31], ["523", 32, 35],
      ["address", 39, 46], ["23", 48, 50]
    ]
    expected.each do |text, from, to|
      assert_equal(Token.new(text, from, to), t.next())
    end
    assert(! t.next())
  end
end
@@ -0,0 +1,20 @@
require File.dirname(__FILE__) + "/../../test_helper"

# StopAnalyzer with no arguments uses the default English stop list; an
# explicit word list replaces it entirely.
class StopAnalyzerTest < Test::Unit::TestCase
  include Ferret::Analysis
  include Ferret::Utils::StringHelper

  def test_stopanalyzer()
    # default stop list: only the two content words survive
    reader = StringReader.new('The Quick AND the DEAD the and to it there their')
    analyzer = StopAnalyzer.new()
    stream = analyzer.token_stream("field name", reader)
    assert_equal(Token.new('quick', 4, 9), stream.next())
    assert_equal(Token.new('dead', 18, 22), stream.next())
    assert(! stream.next())
    # custom single-word stop list
    reader = StringReader.new("David Balmain")
    analyzer = StopAnalyzer.new(["david"])
    stream = analyzer.token_stream("field name", reader)
    assert_equal(Token.new('balmain', 6, 13), stream.next())
    assert(! stream.next())
  end
end
@@ -0,0 +1,14 @@
require File.dirname(__FILE__) + "/../../test_helper"

# StopFilter.new_with_file reads its stop list from disk (data/wordfile);
# the filtered stream should contain only the non-stop words.
class StopFilterTest < Test::Unit::TestCase
  include Ferret::Analysis
  include Ferret::Utils::StringHelper

  def test_stopfilter()
    reader = StringReader.new('The Quick AND the DEAD the and to it there their')
    filtered = StopFilter.new_with_file(LowerCaseTokenizer.new(reader), File.dirname(__FILE__) + '/data/wordfile')
    assert_equal(Token.new('quick', 4, 9), filtered.next())
    assert_equal(Token.new('dead', 18, 22), filtered.next())
    assert(! filtered.next())
  end
end
@@ -0,0 +1,21 @@
require File.dirname(__FILE__) + "/../../test_helper"

# WhiteSpaceAnalyzer splits on whitespace only: case, punctuation and
# symbols are all preserved inside the tokens.
class WhiteSpaceAnalyzerTest < Test::Unit::TestCase
  include Ferret::Analysis
  include Ferret::Utils::StringHelper

  def test_whitespaceanalyzer()
    input = StringReader.new('DBalmain@gmail.com is My e-mail 52 #$ address. 23#@$')
    a = WhiteSpaceAnalyzer.new()
    t = a.token_stream("field", input)
    expected = [
      ['DBalmain@gmail.com', 0, 18], ['is', 19, 21], ['My', 22, 24],
      ['e-mail', 25, 31], ['52', 32, 34], ['#$', 37, 39],
      ['address.', 40, 48], ['23#@$', 49, 54]
    ]
    expected.each do |text, from, to|
      assert_equal(Token.new(text, from, to), t.next())
    end
    assert(! t.next())
  end
end