ferret 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +20 -0
- data/README +109 -0
- data/Rakefile +275 -0
- data/TODO +9 -0
- data/TUTORIAL +197 -0
- data/ext/extconf.rb +3 -0
- data/ext/ferret.c +23 -0
- data/ext/ferret.h +85 -0
- data/ext/index_io.c +543 -0
- data/ext/priority_queue.c +227 -0
- data/ext/ram_directory.c +316 -0
- data/ext/segment_merge_queue.c +41 -0
- data/ext/string_helper.c +42 -0
- data/ext/tags +240 -0
- data/ext/term.c +261 -0
- data/ext/term_buffer.c +299 -0
- data/ext/util.c +12 -0
- data/lib/ferret.rb +41 -0
- data/lib/ferret/analysis.rb +11 -0
- data/lib/ferret/analysis/analyzers.rb +93 -0
- data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
- data/lib/ferret/analysis/token.rb +79 -0
- data/lib/ferret/analysis/token_filters.rb +86 -0
- data/lib/ferret/analysis/token_stream.rb +26 -0
- data/lib/ferret/analysis/tokenizers.rb +107 -0
- data/lib/ferret/analysis/word_list_loader.rb +27 -0
- data/lib/ferret/document.rb +2 -0
- data/lib/ferret/document/document.rb +152 -0
- data/lib/ferret/document/field.rb +304 -0
- data/lib/ferret/index.rb +26 -0
- data/lib/ferret/index/compound_file_io.rb +343 -0
- data/lib/ferret/index/document_writer.rb +288 -0
- data/lib/ferret/index/field_infos.rb +259 -0
- data/lib/ferret/index/fields_io.rb +175 -0
- data/lib/ferret/index/index.rb +228 -0
- data/lib/ferret/index/index_file_names.rb +33 -0
- data/lib/ferret/index/index_reader.rb +462 -0
- data/lib/ferret/index/index_writer.rb +488 -0
- data/lib/ferret/index/multi_reader.rb +363 -0
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
- data/lib/ferret/index/segment_infos.rb +130 -0
- data/lib/ferret/index/segment_merge_info.rb +47 -0
- data/lib/ferret/index/segment_merge_queue.rb +16 -0
- data/lib/ferret/index/segment_merger.rb +337 -0
- data/lib/ferret/index/segment_reader.rb +380 -0
- data/lib/ferret/index/segment_term_enum.rb +178 -0
- data/lib/ferret/index/segment_term_vector.rb +58 -0
- data/lib/ferret/index/term.rb +49 -0
- data/lib/ferret/index/term_buffer.rb +88 -0
- data/lib/ferret/index/term_doc_enum.rb +283 -0
- data/lib/ferret/index/term_enum.rb +52 -0
- data/lib/ferret/index/term_info.rb +41 -0
- data/lib/ferret/index/term_infos_io.rb +312 -0
- data/lib/ferret/index/term_vector_offset_info.rb +20 -0
- data/lib/ferret/index/term_vectors_io.rb +552 -0
- data/lib/ferret/query_parser.rb +274 -0
- data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
- data/lib/ferret/search.rb +49 -0
- data/lib/ferret/search/boolean_clause.rb +100 -0
- data/lib/ferret/search/boolean_query.rb +303 -0
- data/lib/ferret/search/boolean_scorer.rb +294 -0
- data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
- data/lib/ferret/search/conjunction_scorer.rb +99 -0
- data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
- data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
- data/lib/ferret/search/explanation.rb +41 -0
- data/lib/ferret/search/field_cache.rb +216 -0
- data/lib/ferret/search/field_doc.rb +31 -0
- data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
- data/lib/ferret/search/filter.rb +11 -0
- data/lib/ferret/search/filtered_query.rb +130 -0
- data/lib/ferret/search/filtered_term_enum.rb +79 -0
- data/lib/ferret/search/fuzzy_query.rb +153 -0
- data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
- data/lib/ferret/search/hit_collector.rb +34 -0
- data/lib/ferret/search/hit_queue.rb +11 -0
- data/lib/ferret/search/index_searcher.rb +173 -0
- data/lib/ferret/search/match_all_docs_query.rb +104 -0
- data/lib/ferret/search/multi_phrase_query.rb +204 -0
- data/lib/ferret/search/multi_term_query.rb +65 -0
- data/lib/ferret/search/non_matching_scorer.rb +22 -0
- data/lib/ferret/search/phrase_positions.rb +55 -0
- data/lib/ferret/search/phrase_query.rb +217 -0
- data/lib/ferret/search/phrase_scorer.rb +153 -0
- data/lib/ferret/search/prefix_query.rb +47 -0
- data/lib/ferret/search/query.rb +111 -0
- data/lib/ferret/search/query_filter.rb +51 -0
- data/lib/ferret/search/range_filter.rb +103 -0
- data/lib/ferret/search/range_query.rb +139 -0
- data/lib/ferret/search/req_excl_scorer.rb +125 -0
- data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
- data/lib/ferret/search/score_doc.rb +38 -0
- data/lib/ferret/search/score_doc_comparator.rb +114 -0
- data/lib/ferret/search/scorer.rb +91 -0
- data/lib/ferret/search/similarity.rb +278 -0
- data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
- data/lib/ferret/search/sort.rb +105 -0
- data/lib/ferret/search/sort_comparator.rb +60 -0
- data/lib/ferret/search/sort_field.rb +87 -0
- data/lib/ferret/search/spans.rb +12 -0
- data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
- data/lib/ferret/search/spans/span_first_query.rb +79 -0
- data/lib/ferret/search/spans/span_near_query.rb +108 -0
- data/lib/ferret/search/spans/span_not_query.rb +130 -0
- data/lib/ferret/search/spans/span_or_query.rb +176 -0
- data/lib/ferret/search/spans/span_query.rb +25 -0
- data/lib/ferret/search/spans/span_scorer.rb +74 -0
- data/lib/ferret/search/spans/span_term_query.rb +105 -0
- data/lib/ferret/search/spans/span_weight.rb +84 -0
- data/lib/ferret/search/spans/spans_enum.rb +44 -0
- data/lib/ferret/search/term_query.rb +128 -0
- data/lib/ferret/search/term_scorer.rb +181 -0
- data/lib/ferret/search/top_docs.rb +24 -0
- data/lib/ferret/search/top_field_docs.rb +17 -0
- data/lib/ferret/search/weight.rb +54 -0
- data/lib/ferret/search/wildcard_query.rb +26 -0
- data/lib/ferret/search/wildcard_term_enum.rb +61 -0
- data/lib/ferret/stemmers.rb +1 -0
- data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
- data/lib/ferret/store.rb +5 -0
- data/lib/ferret/store/buffered_index_io.rb +191 -0
- data/lib/ferret/store/directory.rb +139 -0
- data/lib/ferret/store/fs_store.rb +338 -0
- data/lib/ferret/store/index_io.rb +259 -0
- data/lib/ferret/store/ram_store.rb +282 -0
- data/lib/ferret/utils.rb +7 -0
- data/lib/ferret/utils/bit_vector.rb +105 -0
- data/lib/ferret/utils/date_tools.rb +138 -0
- data/lib/ferret/utils/number_tools.rb +91 -0
- data/lib/ferret/utils/parameter.rb +41 -0
- data/lib/ferret/utils/priority_queue.rb +120 -0
- data/lib/ferret/utils/string_helper.rb +47 -0
- data/lib/ferret/utils/weak_key_hash.rb +51 -0
- data/rake_utils/code_statistics.rb +106 -0
- data/setup.rb +1551 -0
- data/test/benchmark/tb_ram_store.rb +76 -0
- data/test/benchmark/tb_rw_vint.rb +26 -0
- data/test/longrunning/tc_numbertools.rb +60 -0
- data/test/longrunning/tm_store.rb +19 -0
- data/test/test_all.rb +9 -0
- data/test/test_helper.rb +6 -0
- data/test/unit/analysis/tc_analyzer.rb +21 -0
- data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
- data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
- data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
- data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
- data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
- data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
- data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
- data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
- data/test/unit/analysis/tc_stop_filter.rb +14 -0
- data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
- data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
- data/test/unit/analysis/tc_word_list_loader.rb +32 -0
- data/test/unit/document/tc_document.rb +47 -0
- data/test/unit/document/tc_field.rb +80 -0
- data/test/unit/index/tc_compound_file_io.rb +107 -0
- data/test/unit/index/tc_field_infos.rb +119 -0
- data/test/unit/index/tc_fields_io.rb +167 -0
- data/test/unit/index/tc_index.rb +140 -0
- data/test/unit/index/tc_index_reader.rb +622 -0
- data/test/unit/index/tc_index_writer.rb +57 -0
- data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
- data/test/unit/index/tc_segment_infos.rb +74 -0
- data/test/unit/index/tc_segment_term_docs.rb +17 -0
- data/test/unit/index/tc_segment_term_enum.rb +60 -0
- data/test/unit/index/tc_segment_term_vector.rb +71 -0
- data/test/unit/index/tc_term.rb +22 -0
- data/test/unit/index/tc_term_buffer.rb +57 -0
- data/test/unit/index/tc_term_info.rb +19 -0
- data/test/unit/index/tc_term_infos_io.rb +192 -0
- data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
- data/test/unit/index/tc_term_vectors_io.rb +108 -0
- data/test/unit/index/th_doc.rb +244 -0
- data/test/unit/query_parser/tc_query_parser.rb +84 -0
- data/test/unit/search/tc_filter.rb +113 -0
- data/test/unit/search/tc_fuzzy_query.rb +136 -0
- data/test/unit/search/tc_index_searcher.rb +188 -0
- data/test/unit/search/tc_search_and_sort.rb +98 -0
- data/test/unit/search/tc_similarity.rb +37 -0
- data/test/unit/search/tc_sort.rb +48 -0
- data/test/unit/search/tc_sort_field.rb +27 -0
- data/test/unit/search/tc_spans.rb +153 -0
- data/test/unit/store/tc_fs_store.rb +84 -0
- data/test/unit/store/tc_ram_store.rb +35 -0
- data/test/unit/store/tm_store.rb +180 -0
- data/test/unit/store/tm_store_lock.rb +68 -0
- data/test/unit/ts_analysis.rb +16 -0
- data/test/unit/ts_document.rb +4 -0
- data/test/unit/ts_index.rb +18 -0
- data/test/unit/ts_query_parser.rb +3 -0
- data/test/unit/ts_search.rb +10 -0
- data/test/unit/ts_store.rb +6 -0
- data/test/unit/ts_utils.rb +10 -0
- data/test/unit/utils/tc_bit_vector.rb +65 -0
- data/test/unit/utils/tc_date_tools.rb +50 -0
- data/test/unit/utils/tc_number_tools.rb +59 -0
- data/test/unit/utils/tc_parameter.rb +40 -0
- data/test/unit/utils/tc_priority_queue.rb +62 -0
- data/test/unit/utils/tc_string_helper.rb +21 -0
- data/test/unit/utils/tc_weak_key_hash.rb +25 -0
- metadata +251 -0
data/test/benchmark/tb_ram_store.rb
ADDED
@@ -0,0 +1,76 @@
+require File.dirname(__FILE__) + "/../test_helper"
+require 'benchmark'
+
+class RAMStoreTest < Test::Unit::TestCase
+  def setup
+    @dir = Ferret::Store::RAMDirectory.new
+  end
+
+  def teardown
+    @dir.close()
+  end
+
+  def test_rw_bytes
+    bytes = [0x34, 0x87, 0xF9, 0xEA, 0x00, 0xFF]
+    rw_test(bytes, "byte")
+  end
+
+  def test_rw_ints
+    ints = [-2147483648, 2147483647, -1, 0]
+    rw_test(ints, "int")
+  end
+
+  def test_rw_longs
+    longs = [-9223372036854775808, 9223372036854775807, -1, 0]
+    rw_test(longs, "long")
+  end
+
+  def test_rw_uints
+    uints = [0xffffffff, 100000, 0]
+    rw_test(uints, "uint")
+  end
+
+  def test_rw_ulongs
+    ulongs = [0xffffffffffffffff, 100000000000000, 0]
+    rw_test(ulongs, "ulong")
+  end
+
+  def test_rw_vints
+    vints = [ 0xF8DC843342FE3484234987FE98AB987C897D214D123D123458EFBE2E238BACDEB9878790ABCDEF123DEF23988B89C,
+              0x0000000000000000000000000000000000000000,
+              0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF]
+    rw_test(vints, "vint")
+  end
+
+  def test_rw_vlongs
+    vlongs = [ 0xF8DC843342FE3484234987FE98AB987C897D214D123D123458EFBE2E238BACDEB9878790ABCDEF123DEF23988B89C,
+               0x0000000000000000000000000000000000000000,
+               0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF]
+    rw_test(vlongs, "vlong")
+  end
+
+  def test_rw_strings
+    strings = ['This is a ruby ferret test string ~!@#$%^&*()`123456790-=\)_+|', 'This is another string. I\'ll make this one a little longer than the last one. But I guess we need a few shorter ones too.', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten']
+    rw_test(strings, "string")
+  end
+
+  # this test fills up the output stream so that the buffer will have to be
+  # written a few times. It then uses seek to make sure that it works
+  # correctly
+
+  def rw_test(values, type)
+    puts "\nrw_#{type} test"
+    Benchmark.bmbm do |x|
+      x.report("write") do
+        ostream = @dir.create_output("rw_#{type}.test")
+        1000.times {values.each { |b| ostream.__send__("write_" + type, b) }}
+        ostream.close
+      end
+      x.report("read") do
+        istream = @dir.open_input("rw_#{type}.test")
+        1000.times {values.each { |b| assert_equal(b, istream.__send__("read_" + type), "#{type} should be equal") }}
+        istream.close
+      end
+    end
+  end
+end
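The benchmark above exercises the store layer's stream API: every value type is written through an output stream created on the directory and read back through an input stream. Below is a minimal sketch of that round-trip pattern on its own, using only the calls that appear in this diff (create_output, open_input, write_vint/read_vint, write_string/read_string); the file name and values are arbitrary.

require 'ferret'

# Round-trip two values through an in-memory directory, mirroring rw_test above.
dir = Ferret::Store::RAMDirectory.new

os = dir.create_output("roundtrip.test")   # arbitrary file name
os.write_vint(123456789)                   # variable-length integer
os.write_string("hello ferret")            # length-prefixed string
os.close

is = dir.open_input("roundtrip.test")
raise "vint mismatch"   unless is.read_vint   == 123456789
raise "string mismatch" unless is.read_string == "hello ferret"
is.close

dir.close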
data/test/benchmark/tb_rw_vint.rb
ADDED
@@ -0,0 +1,26 @@
+$:.unshift File.join(File.dirname(__FILE__), '../../lib')
+
+require 'ferret'
+
+vints = [ 9223372036854775807,
+          0x00,
+          0xFFFFFFFFFFFFFFFF]
+t = Time.new
+10.times do
+  dpath = File.join(File.dirname(__FILE__),
+                    'fsdir')
+  dir = Ferret::Store::FSDirectory.get_directory(dpath, true)
+
+  100.times do
+    ostream = dir.create_output("rw_vint.test")
+    300.times { |i| ostream.write_vint(vints[i%3]) }
+    ostream.close
+    istream = dir.open_input("rw_vint.test")
+    300.times { istream.read_vint }
+    istream.close
+  end
+
+  dir.close
+end
+
+puts "took #{Time.new - t} seconds"
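Ferret is a Ruby port of Lucene, so the vints being timed here are presumably Lucene-style variable-length integers: seven payload bits per byte, low-order bits first, with the high bit of each byte acting as a continuation flag. The helper below is only an illustration of that assumed scheme, not Ferret's own implementation.

# Illustrative 7-bits-per-byte variable-length integer codec (assumed format).
def encode_vint(n)
  bytes = []
  while n > 0x7F
    bytes << ((n & 0x7F) | 0x80)  # low 7 bits, continuation bit set
    n >>= 7
  end
  bytes << n                      # final byte has the high bit clear
end

def decode_vint(bytes)
  n = 0
  bytes.each_with_index do |b, i|
    n |= (b & 0x7F) << (7 * i)
    return n if b < 0x80          # high bit clear: last byte
  end
  n
end

p encode_vint(123456789)              # => [149, 154, 239, 58]
p decode_vint(encode_vint(123456789)) # => 123456789

Because Ruby integers are arbitrary precision, the same scheme extends naturally to the very large values in the benchmarks above.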
data/test/longrunning/tc_numbertools.rb
ADDED
@@ -0,0 +1,60 @@
+require File.dirname(__FILE__) + "/../../../test_helper"
+
+
+class NumberToolsTest < Test::Unit::TestCase
+  include Lucene::Document
+  def test_near_zero()
+    10.times() do |i|
+      10.times() { |j| subtest_two_longs(i, j) }
+    end
+  end
+
+  def test_max()
+    # make sure the constants convert to their equivelents
+    assert_equal(NumberTools::LONG_MAX_VALUE, NumberTools.s_to_long(NumberTools::MAX_STRING_VALUE))
+    assert_equal(NumberTools::MAX_STRING_VALUE, NumberTools.long_to_s(NumberTools::LONG_MAX_VALUE))
+    # test near MAX, too
+
+    NumberTools::LONG_MAX_VALUE.downto(NumberTools::LONG_MAX_VALUE - 100) do |l|
+      subtest_two_longs(l, l - 1)
+    end
+  end
+
+  def test_min()
+    # make sure the constants convert to their equivelents
+    assert_equal(NumberTools::LONG_MIN_VALUE, NumberTools.s_to_long(NumberTools::MIN_STRING_VALUE))
+    assert_equal(NumberTools::MIN_STRING_VALUE, NumberTools.long_to_s(NumberTools::LONG_MIN_VALUE))
+
+    # test near MIN, too
+    NumberTools::LONG_MIN_VALUE.upto(NumberTools::LONG_MIN_VALUE + 100) do |l|
+      subtest_two_longs(l, l + 1)
+    end
+  end
+
+  def subtest_two_longs(i, j)
+    # convert to strings
+    a = NumberTools.long_to_s(i)
+    b = NumberTools.long_to_s(j)
+
+    # are they the right length?
+    assert_equal(NumberTools::STR_SIZE, a.length())
+    assert_equal(NumberTools::STR_SIZE, b.length())
+
+    # are they the right order?
+    if (i < j)
+      assert(a < b)
+    elsif (i > j)
+      assert(a > b)
+    else
+      assert_equal(a, b)
+    end
+
+    # can we convert them back to longs?
+    i2 = NumberTools.s_to_long(a)
+    j2 = NumberTools.s_to_long(b)
+
+    assert_equal(i, i2)
+    assert_equal(j, j2)
+  end
+
+end
data/test/longrunning/tm_store.rb
ADDED
@@ -0,0 +1,19 @@
+module StoreTest
+  # declare dir so inheritors can access it.
+  def test_modified_full
+    # difficult to test this one but as file mtime is only stored to the
+    # nearest second. We can assume this test will happen in less than one
+    # second. (I hope)
+    time = Time.new.to_i
+    @dir.touch('mtime_test')
+    time_before = @dir.modified('mtime_test').to_i
+    assert(time_before - time <= 2, "test that mtime is approximately equal to the system time when the file was touched")
+    # wait until the time ticks over one second.
+    time = Time.new while (time.to_i == time_before)
+    time_before_again = @dir.modified('mtime_test').to_i
+    assert_equal(time_before, time_before_again, "the modified time shouldn't change")
+    @dir.touch('mtime_test')
+    time_after = @dir.modified('mtime_test').to_i
+    assert(time_before < time_after, "the modified time should now be greater")
+  end
+end
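Note that tm_store.rb defines a module, not a test case: it assumes the including class has already created @dir, so the same mtime check can be run against any Directory implementation (RAM or filesystem). A minimal sketch of how a concrete case could reuse it is shown below; the class name is hypothetical, and the setup/teardown mirror the RAMStoreTest benchmark above.

# Hypothetical concrete test case: it supplies @dir and thereby picks up every
# test_* method defined in the shared StoreTest module.
class RAMStoreLongRunningTest < Test::Unit::TestCase
  include StoreTest

  def setup
    @dir = Ferret::Store::RAMDirectory.new
  end

  def teardown
    @dir.close
  end
end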
data/test/test_all.rb
ADDED
data/test/test_helper.rb
ADDED
data/test/unit/analysis/tc_analyzer.rb
ADDED
@@ -0,0 +1,21 @@
+require File.dirname(__FILE__) + "/../../test_helper"
+
+class AnalyzerTest < Test::Unit::TestCase
+  include Ferret::Analysis
+  include Ferret::Utils::StringHelper
+
+  def test_analyzer()
+    input = StringReader.new('DBalmain@gmail.com is My E-Mail 523@#$ ADDRESS. 23#@$')
+    a = Analyzer.new()
+    t = a.token_stream("fieldname", input)
+    assert_equal(Token.new("dbalmain", 0, 8), t.next())
+    assert_equal(Token.new("gmail", 9, 14), t.next())
+    assert_equal(Token.new("com", 15, 18), t.next())
+    assert_equal(Token.new("is", 19, 21), t.next())
+    assert_equal(Token.new("my", 22, 24), t.next())
+    assert_equal(Token.new("e", 25, 26), t.next())
+    assert_equal(Token.new("mail", 27, 31), t.next())
+    assert_equal(Token.new("address", 39, 46), t.next())
+    assert(! t.next())
+  end
+end
data/test/unit/analysis/tc_letter_tokenizer.rb
ADDED
@@ -0,0 +1,20 @@
+require File.dirname(__FILE__) + "/../../test_helper"
+
+class LetterTokenizerTest < Test::Unit::TestCase
+  include Ferret::Analysis
+  include Ferret::Utils::StringHelper
+
+  def test_lettertokenizer()
+    input = StringReader.new('DBalmain@gmail.com is My e-mail 523@#$ address. 23#@$')
+    t = LetterTokenizer.new(input)
+    assert_equal(Token.new("DBalmain", 0, 8), t.next())
+    assert_equal(Token.new("gmail", 9, 14), t.next())
+    assert_equal(Token.new("com", 15, 18), t.next())
+    assert_equal(Token.new("is", 19, 21), t.next())
+    assert_equal(Token.new("My", 22, 24), t.next())
+    assert_equal(Token.new("e", 25, 26), t.next())
+    assert_equal(Token.new("mail", 27, 31), t.next())
+    assert_equal(Token.new("address", 39, 46), t.next())
+    assert(! t.next())
+  end
+end
data/test/unit/analysis/tc_lower_case_filter.rb
ADDED
@@ -0,0 +1,20 @@
+require File.dirname(__FILE__) + "/../../test_helper"
+
+class LowerCaseFilterTest < Test::Unit::TestCase
+  include Ferret::Analysis
+  include Ferret::Utils::StringHelper
+
+  def test_lowercasefilter()
+    input = StringReader.new('DBalmain@gmail.com is My E-Mail 52 #$ ADDRESS. 23#@$')
+    t = LowerCaseFilter.new(WhiteSpaceTokenizer.new(input))
+    assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next())
+    assert_equal(Token.new('is', 19, 21), t.next())
+    assert_equal(Token.new('my', 22, 24), t.next())
+    assert_equal(Token.new('e-mail', 25, 31), t.next())
+    assert_equal(Token.new('52', 32, 34), t.next())
+    assert_equal(Token.new('#$', 37, 39), t.next())
+    assert_equal(Token.new('address.', 40, 48), t.next())
+    assert_equal(Token.new('23#@$', 49, 54), t.next())
+    assert(! t.next())
+  end
+end
data/test/unit/analysis/tc_lower_case_tokenizer.rb
ADDED
@@ -0,0 +1,27 @@
+require File.dirname(__FILE__) + "/../../test_helper"
+
+class LowerCaseTokenizerTest < Test::Unit::TestCase
+  include Ferret::Analysis
+  include Ferret::Utils::StringHelper
+
+  def test_normalize()
+    lt = LowerCaseTokenizer.new(StringReader.new(""))
+    assert_equal('!', lt.__send__(:normalize,"!"))
+    assert_equal('r', lt.__send__(:normalize,"r"))
+    assert_equal('r', lt.__send__(:normalize,"R"))
+  end
+
+  def test_lowercase_tokenizer()
+    input = StringReader.new('DBalmain@gmail.com is My E-Mail 523@#$ ADDRESS. 23#@$')
+    t = LowerCaseTokenizer.new(input)
+    assert_equal(Token.new("dbalmain", 0, 8), t.next())
+    assert_equal(Token.new("gmail", 9, 14), t.next())
+    assert_equal(Token.new("com", 15, 18), t.next())
+    assert_equal(Token.new("is", 19, 21), t.next())
+    assert_equal(Token.new("my", 22, 24), t.next())
+    assert_equal(Token.new("e", 25, 26), t.next())
+    assert_equal(Token.new("mail", 27, 31), t.next())
+    assert_equal(Token.new("address", 39, 46), t.next())
+    assert(! t.next())
+  end
+end
data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb
ADDED
@@ -0,0 +1,39 @@
+require File.dirname(__FILE__) + "/../../test_helper"
+
+class PerFieldAnalyzerWrapperTest < Test::Unit::TestCase
+  include Ferret::Analysis
+  include Ferret::Utils::StringHelper
+  def test_perfieldanalyzerwrapper()
+    aw = PerFieldAnalyzerWrapper.new(Analyzer.new())
+    aw.add_analyzer("abstract", WhiteSpaceAnalyzer.new())
+    aw.add_analyzer("body", StopAnalyzer.new(['is', 'my', 'address']))
+    input = StringReader.new('DBalmain@gmail.com is My e-mail ADDRESS')
+    t = aw.token_stream("title", input)
+    assert_equal(Token.new("dbalmain", 0, 8), t.next())
+    assert_equal(Token.new("gmail", 9, 14), t.next())
+    assert_equal(Token.new("com", 15, 18), t.next())
+    assert_equal(Token.new("is", 19, 21), t.next())
+    assert_equal(Token.new("my", 22, 24), t.next())
+    assert_equal(Token.new("e", 25, 26), t.next())
+    assert_equal(Token.new("mail", 27, 31), t.next())
+    assert_equal(Token.new("address", 32, 39), t.next())
+    assert(! t.next())
+    input.reset()
+    t = aw.token_stream("abstract", input)
+    assert_equal(Token.new('DBalmain@gmail.com', 0, 18), t.next())
+    assert_equal(Token.new('is', 19, 21), t.next())
+    assert_equal(Token.new('My', 22, 24), t.next())
+    assert_equal(Token.new('e-mail', 25, 31), t.next())
+    assert_equal(Token.new("ADDRESS", 32, 39), t.next())
+    if ( token = t.next()): puts token.term_text end
+    assert(! t.next())
+    input.reset()
+    t = aw.token_stream("body", input)
+    assert_equal(Token.new("dbalmain", 0, 8), t.next())
+    assert_equal(Token.new("gmail", 9, 14), t.next())
+    assert_equal(Token.new("com", 15, 18), t.next())
+    assert_equal(Token.new("e", 25, 26), t.next())
+    assert_equal(Token.new("mail", 27, 31), t.next())
+    assert(! t.next())
+  end
+end
data/test/unit/analysis/tc_porter_stem_filter.rb
ADDED
@@ -0,0 +1,16 @@
+require File.dirname(__FILE__) + "/../../test_helper"
+
+class PorterStemFilterTest < Test::Unit::TestCase
+  include Ferret::Analysis
+  include Ferret::Utils::StringHelper
+
+  def test_porterstempfilter()
+    input = StringReader.new('breath Breathes BreatHed BREATHING')
+    t = PorterStemFilter.new(LowerCaseFilter.new(WhiteSpaceTokenizer.new(input)))
+    assert_equal(Token.new('breath', 0, 6), t.next())
+    assert_equal(Token.new('breath', 7, 15), t.next())
+    assert_equal(Token.new('breath', 16, 24), t.next())
+    assert_equal(Token.new('breath', 25, 34), t.next())
+    assert(! t.next())
+  end
+end
data/test/unit/analysis/tc_standard_analyzer.rb
ADDED
@@ -0,0 +1,20 @@
+require File.dirname(__FILE__) + "/../../test_helper"
+
+class StandardAnalyzerTest < Test::Unit::TestCase
+  include Ferret::Utils::StringHelper
+  include Ferret::Analysis
+
+  def test_lettertokenizer()
+    input = StringReader.new('D.Ba_l-n@gma-l.com AB&Sons Toys\'r\'us you\'re she\'s, #$%^$%*& job@dot I.B.M. the an AnD THEIR')
+    sa = StandardAnalyzer.new()
+    t = sa.token_stream("field", input)
+    assert_equal(Token.new("d.ba_l-n@gma-l.com", 0, 18), t.next())
+    assert_equal(Token.new("ab&sons", 19, 26), t.next())
+    assert_equal(Token.new("toys'r'us", 27, 36), t.next())
+    assert_equal(Token.new("you're", 37, 43), t.next())
+    assert_equal(Token.new("she", 44, 49), t.next())
+    assert_equal(Token.new("job@dot", 60, 67), t.next())
+    assert_equal(Token.new("ibm", 68, 74), t.next())
+    assert(! t.next())
+  end
+end
data/test/unit/analysis/tc_standard_tokenizer.rb
ADDED
@@ -0,0 +1,20 @@
+require File.dirname(__FILE__) + "/../../test_helper"
+
+class StandardTokenizerTest < Test::Unit::TestCase
+  include Ferret::Analysis
+  include Ferret::Utils::StringHelper
+
+  def test_lettertokenizer()
+    input = StringReader.new('DBalmain@gmail.com is My e-mail 523@#$ address. 23#@$')
+    t = StandardTokenizer.new(input)
+    assert_equal(Token.new("DBalmain@gmail.com", 0, 18), t.next())
+    assert_equal(Token.new("is", 19, 21), t.next())
+    assert_equal(Token.new("My", 22, 24), t.next())
+    assert_equal(Token.new("e", 25, 26), t.next())
+    assert_equal(Token.new("mail", 27, 31), t.next())
+    assert_equal(Token.new("523", 32, 35), t.next())
+    assert_equal(Token.new("address", 39, 46), t.next())
+    assert_equal(Token.new("23", 48, 50), t.next())
+    assert(! t.next())
+  end
+end
data/test/unit/analysis/tc_stop_analyzer.rb
ADDED
@@ -0,0 +1,20 @@
+require File.dirname(__FILE__) + "/../../test_helper"
+
+class StopAnalyzerTest < Test::Unit::TestCase
+  include Ferret::Analysis
+  include Ferret::Utils::StringHelper
+
+  def test_stopanalyzer()
+    input = StringReader.new('The Quick AND the DEAD the and to it there their')
+    a = StopAnalyzer.new()
+    t = a.token_stream("field name", input)
+    assert_equal(Token.new('quick', 4, 9), t.next())
+    assert_equal(Token.new('dead', 18, 22), t.next())
+    assert(! t.next())
+    input = StringReader.new("David Balmain")
+    a = StopAnalyzer.new(["david"])
+    t = a.token_stream("field name", input)
+    assert_equal(Token.new('balmain', 6, 13), t.next())
+    assert(! t.next())
+  end
+end
data/test/unit/analysis/tc_stop_filter.rb
ADDED
@@ -0,0 +1,14 @@
+require File.dirname(__FILE__) + "/../../test_helper"
+
+class StopFilterTest < Test::Unit::TestCase
+  include Ferret::Analysis
+  include Ferret::Utils::StringHelper
+
+  def test_stopfilter()
+    input = StringReader.new('The Quick AND the DEAD the and to it there their')
+    t = StopFilter.new_with_file(LowerCaseTokenizer.new(input), File.dirname(__FILE__) + '/data/wordfile')
+    assert_equal(Token.new('quick', 4, 9), t.next())
+    assert_equal(Token.new('dead', 18, 22), t.next())
+    assert(! t.next())
+  end
+end
data/test/unit/analysis/tc_white_space_analyzer.rb
ADDED
@@ -0,0 +1,21 @@
+require File.dirname(__FILE__) + "/../../test_helper"
+
+class WhiteSpaceAnalyzerTest < Test::Unit::TestCase
+  include Ferret::Analysis
+  include Ferret::Utils::StringHelper
+
+  def test_whitespaceanalyzer()
+    input = StringReader.new('DBalmain@gmail.com is My e-mail 52 #$ address. 23#@$')
+    a = WhiteSpaceAnalyzer.new()
+    t = a.token_stream("field", input)
+    assert_equal(Token.new('DBalmain@gmail.com', 0, 18), t.next())
+    assert_equal(Token.new('is', 19, 21), t.next())
+    assert_equal(Token.new('My', 22, 24), t.next())
+    assert_equal(Token.new('e-mail', 25, 31), t.next())
+    assert_equal(Token.new('52', 32, 34), t.next())
+    assert_equal(Token.new('#$', 37, 39), t.next())
+    assert_equal(Token.new('address.', 40, 48), t.next())
+    assert_equal(Token.new('23#@$', 49, 54), t.next())
+    assert(! t.next())
+  end
+end