ferret 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/MIT-LICENSE +20 -0
- data/README +109 -0
- data/Rakefile +275 -0
- data/TODO +9 -0
- data/TUTORIAL +197 -0
- data/ext/extconf.rb +3 -0
- data/ext/ferret.c +23 -0
- data/ext/ferret.h +85 -0
- data/ext/index_io.c +543 -0
- data/ext/priority_queue.c +227 -0
- data/ext/ram_directory.c +316 -0
- data/ext/segment_merge_queue.c +41 -0
- data/ext/string_helper.c +42 -0
- data/ext/tags +240 -0
- data/ext/term.c +261 -0
- data/ext/term_buffer.c +299 -0
- data/ext/util.c +12 -0
- data/lib/ferret.rb +41 -0
- data/lib/ferret/analysis.rb +11 -0
- data/lib/ferret/analysis/analyzers.rb +93 -0
- data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
- data/lib/ferret/analysis/token.rb +79 -0
- data/lib/ferret/analysis/token_filters.rb +86 -0
- data/lib/ferret/analysis/token_stream.rb +26 -0
- data/lib/ferret/analysis/tokenizers.rb +107 -0
- data/lib/ferret/analysis/word_list_loader.rb +27 -0
- data/lib/ferret/document.rb +2 -0
- data/lib/ferret/document/document.rb +152 -0
- data/lib/ferret/document/field.rb +304 -0
- data/lib/ferret/index.rb +26 -0
- data/lib/ferret/index/compound_file_io.rb +343 -0
- data/lib/ferret/index/document_writer.rb +288 -0
- data/lib/ferret/index/field_infos.rb +259 -0
- data/lib/ferret/index/fields_io.rb +175 -0
- data/lib/ferret/index/index.rb +228 -0
- data/lib/ferret/index/index_file_names.rb +33 -0
- data/lib/ferret/index/index_reader.rb +462 -0
- data/lib/ferret/index/index_writer.rb +488 -0
- data/lib/ferret/index/multi_reader.rb +363 -0
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
- data/lib/ferret/index/segment_infos.rb +130 -0
- data/lib/ferret/index/segment_merge_info.rb +47 -0
- data/lib/ferret/index/segment_merge_queue.rb +16 -0
- data/lib/ferret/index/segment_merger.rb +337 -0
- data/lib/ferret/index/segment_reader.rb +380 -0
- data/lib/ferret/index/segment_term_enum.rb +178 -0
- data/lib/ferret/index/segment_term_vector.rb +58 -0
- data/lib/ferret/index/term.rb +49 -0
- data/lib/ferret/index/term_buffer.rb +88 -0
- data/lib/ferret/index/term_doc_enum.rb +283 -0
- data/lib/ferret/index/term_enum.rb +52 -0
- data/lib/ferret/index/term_info.rb +41 -0
- data/lib/ferret/index/term_infos_io.rb +312 -0
- data/lib/ferret/index/term_vector_offset_info.rb +20 -0
- data/lib/ferret/index/term_vectors_io.rb +552 -0
- data/lib/ferret/query_parser.rb +274 -0
- data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
- data/lib/ferret/search.rb +49 -0
- data/lib/ferret/search/boolean_clause.rb +100 -0
- data/lib/ferret/search/boolean_query.rb +303 -0
- data/lib/ferret/search/boolean_scorer.rb +294 -0
- data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
- data/lib/ferret/search/conjunction_scorer.rb +99 -0
- data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
- data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
- data/lib/ferret/search/explanation.rb +41 -0
- data/lib/ferret/search/field_cache.rb +216 -0
- data/lib/ferret/search/field_doc.rb +31 -0
- data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
- data/lib/ferret/search/filter.rb +11 -0
- data/lib/ferret/search/filtered_query.rb +130 -0
- data/lib/ferret/search/filtered_term_enum.rb +79 -0
- data/lib/ferret/search/fuzzy_query.rb +153 -0
- data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
- data/lib/ferret/search/hit_collector.rb +34 -0
- data/lib/ferret/search/hit_queue.rb +11 -0
- data/lib/ferret/search/index_searcher.rb +173 -0
- data/lib/ferret/search/match_all_docs_query.rb +104 -0
- data/lib/ferret/search/multi_phrase_query.rb +204 -0
- data/lib/ferret/search/multi_term_query.rb +65 -0
- data/lib/ferret/search/non_matching_scorer.rb +22 -0
- data/lib/ferret/search/phrase_positions.rb +55 -0
- data/lib/ferret/search/phrase_query.rb +217 -0
- data/lib/ferret/search/phrase_scorer.rb +153 -0
- data/lib/ferret/search/prefix_query.rb +47 -0
- data/lib/ferret/search/query.rb +111 -0
- data/lib/ferret/search/query_filter.rb +51 -0
- data/lib/ferret/search/range_filter.rb +103 -0
- data/lib/ferret/search/range_query.rb +139 -0
- data/lib/ferret/search/req_excl_scorer.rb +125 -0
- data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
- data/lib/ferret/search/score_doc.rb +38 -0
- data/lib/ferret/search/score_doc_comparator.rb +114 -0
- data/lib/ferret/search/scorer.rb +91 -0
- data/lib/ferret/search/similarity.rb +278 -0
- data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
- data/lib/ferret/search/sort.rb +105 -0
- data/lib/ferret/search/sort_comparator.rb +60 -0
- data/lib/ferret/search/sort_field.rb +87 -0
- data/lib/ferret/search/spans.rb +12 -0
- data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
- data/lib/ferret/search/spans/span_first_query.rb +79 -0
- data/lib/ferret/search/spans/span_near_query.rb +108 -0
- data/lib/ferret/search/spans/span_not_query.rb +130 -0
- data/lib/ferret/search/spans/span_or_query.rb +176 -0
- data/lib/ferret/search/spans/span_query.rb +25 -0
- data/lib/ferret/search/spans/span_scorer.rb +74 -0
- data/lib/ferret/search/spans/span_term_query.rb +105 -0
- data/lib/ferret/search/spans/span_weight.rb +84 -0
- data/lib/ferret/search/spans/spans_enum.rb +44 -0
- data/lib/ferret/search/term_query.rb +128 -0
- data/lib/ferret/search/term_scorer.rb +181 -0
- data/lib/ferret/search/top_docs.rb +24 -0
- data/lib/ferret/search/top_field_docs.rb +17 -0
- data/lib/ferret/search/weight.rb +54 -0
- data/lib/ferret/search/wildcard_query.rb +26 -0
- data/lib/ferret/search/wildcard_term_enum.rb +61 -0
- data/lib/ferret/stemmers.rb +1 -0
- data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
- data/lib/ferret/store.rb +5 -0
- data/lib/ferret/store/buffered_index_io.rb +191 -0
- data/lib/ferret/store/directory.rb +139 -0
- data/lib/ferret/store/fs_store.rb +338 -0
- data/lib/ferret/store/index_io.rb +259 -0
- data/lib/ferret/store/ram_store.rb +282 -0
- data/lib/ferret/utils.rb +7 -0
- data/lib/ferret/utils/bit_vector.rb +105 -0
- data/lib/ferret/utils/date_tools.rb +138 -0
- data/lib/ferret/utils/number_tools.rb +91 -0
- data/lib/ferret/utils/parameter.rb +41 -0
- data/lib/ferret/utils/priority_queue.rb +120 -0
- data/lib/ferret/utils/string_helper.rb +47 -0
- data/lib/ferret/utils/weak_key_hash.rb +51 -0
- data/rake_utils/code_statistics.rb +106 -0
- data/setup.rb +1551 -0
- data/test/benchmark/tb_ram_store.rb +76 -0
- data/test/benchmark/tb_rw_vint.rb +26 -0
- data/test/longrunning/tc_numbertools.rb +60 -0
- data/test/longrunning/tm_store.rb +19 -0
- data/test/test_all.rb +9 -0
- data/test/test_helper.rb +6 -0
- data/test/unit/analysis/tc_analyzer.rb +21 -0
- data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
- data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
- data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
- data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
- data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
- data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
- data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
- data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
- data/test/unit/analysis/tc_stop_filter.rb +14 -0
- data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
- data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
- data/test/unit/analysis/tc_word_list_loader.rb +32 -0
- data/test/unit/document/tc_document.rb +47 -0
- data/test/unit/document/tc_field.rb +80 -0
- data/test/unit/index/tc_compound_file_io.rb +107 -0
- data/test/unit/index/tc_field_infos.rb +119 -0
- data/test/unit/index/tc_fields_io.rb +167 -0
- data/test/unit/index/tc_index.rb +140 -0
- data/test/unit/index/tc_index_reader.rb +622 -0
- data/test/unit/index/tc_index_writer.rb +57 -0
- data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
- data/test/unit/index/tc_segment_infos.rb +74 -0
- data/test/unit/index/tc_segment_term_docs.rb +17 -0
- data/test/unit/index/tc_segment_term_enum.rb +60 -0
- data/test/unit/index/tc_segment_term_vector.rb +71 -0
- data/test/unit/index/tc_term.rb +22 -0
- data/test/unit/index/tc_term_buffer.rb +57 -0
- data/test/unit/index/tc_term_info.rb +19 -0
- data/test/unit/index/tc_term_infos_io.rb +192 -0
- data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
- data/test/unit/index/tc_term_vectors_io.rb +108 -0
- data/test/unit/index/th_doc.rb +244 -0
- data/test/unit/query_parser/tc_query_parser.rb +84 -0
- data/test/unit/search/tc_filter.rb +113 -0
- data/test/unit/search/tc_fuzzy_query.rb +136 -0
- data/test/unit/search/tc_index_searcher.rb +188 -0
- data/test/unit/search/tc_search_and_sort.rb +98 -0
- data/test/unit/search/tc_similarity.rb +37 -0
- data/test/unit/search/tc_sort.rb +48 -0
- data/test/unit/search/tc_sort_field.rb +27 -0
- data/test/unit/search/tc_spans.rb +153 -0
- data/test/unit/store/tc_fs_store.rb +84 -0
- data/test/unit/store/tc_ram_store.rb +35 -0
- data/test/unit/store/tm_store.rb +180 -0
- data/test/unit/store/tm_store_lock.rb +68 -0
- data/test/unit/ts_analysis.rb +16 -0
- data/test/unit/ts_document.rb +4 -0
- data/test/unit/ts_index.rb +18 -0
- data/test/unit/ts_query_parser.rb +3 -0
- data/test/unit/ts_search.rb +10 -0
- data/test/unit/ts_store.rb +6 -0
- data/test/unit/ts_utils.rb +10 -0
- data/test/unit/utils/tc_bit_vector.rb +65 -0
- data/test/unit/utils/tc_date_tools.rb +50 -0
- data/test/unit/utils/tc_number_tools.rb +59 -0
- data/test/unit/utils/tc_parameter.rb +40 -0
- data/test/unit/utils/tc_priority_queue.rb +62 -0
- data/test/unit/utils/tc_string_helper.rb +21 -0
- data/test/unit/utils/tc_weak_key_hash.rb +25 -0
- metadata +251 -0
@@ -0,0 +1,84 @@
|
|
1
|
+
require File.dirname(__FILE__) + "/../../test_helper"
|
2
|
+
|
3
|
+
class QueryParserTest < Test::Unit::TestCase

  def setup
    # "xxx" is the default field name the parser falls back to when a
    # query clause doesn't name a field explicitly.
    @parser = Ferret::QueryParser.new("xxx")
  end

  # Round-trips a table of query strings through the parser and checks
  # that Query#to_s (relative to the default field) renders the expected
  # canonical form for each one.
  def test_strings
    queries_and_expected_forms = [
      ['word', 'word'],
      ['field:word', 'field:word'],
      ['"word1 word2 word3"', '"word word word"'],
      ['"word1 2342 word3"', '"word word"'],
      ['field:"one two three"', 'field:"one two three"'],
      ['field:"one 222 three"', 'field:"one three"'],
      ['field:"one <> three"', 'field:"one <> three"'],
      ['field:"one <> three <>"', 'field:"one <> three"'],
      ['field:"one <> <> <> three <>"', 'field:"one <> <> <> three"'],
      ['field:"one <> <> <> three|four|five <>"', 'field:"one <> <> <> three|four|five"'],
      ['field:"one|two three|four|five six|seven"', 'field:"one|two three|four|five six|seven"'],
      ['[aaa bbb]', '[aaa bbb]'],
      ['{aaa bbb]', '{aaa bbb]'],
      ['field:[aaa bbb}', 'field:[aaa bbb}'],
      ['{aaa bbb}', '{aaa bbb}'],
      ['{aaa|', '{aaa|'],
      ['[aaa|', '[aaa|'],
      ['field:|aaa}', 'field:|aaa}'],
      ['|aaa]', '|aaa]'],
      ['>aaa', '{aaa|'],
      ['>=aaa', '[aaa|'],
      ['<aaa', '|aaa}'],
      ['field:<=aaa', 'field:|aaa]'],
      ['REQ one REQ two', '+one +two'],
      ['REQ one two', '+one two'],
      ['one REQ two', 'one +two'],
      ['+one +two', '+one +two'],
      ['+one two', '+one two'],
      ['one +two', 'one +two'],
      ['-one -two', '-one -two'],
      ['-one two', '-one two'],
      ['one -two', 'one -two'],
      ['!one !two', '-one -two'],
      ['!one two', '-one two'],
      ['one !two', 'one -two'],
      ['NOT one NOT two', '-one -two'],
      ['NOT one two', '-one two'],
      ['one NOT two', 'one -two'],
      ['one two', 'one two'],
      ['one OR two', 'one two'],
      ['one AND two', '+one +two'],
      ['one two AND three', 'one two +three'],
      ['one two OR three', 'one two three'],
      ['one (two AND three)', 'one (+two +three)'],
      ['one AND (two OR three)', '+one +(two three)'],
      ['field:(one AND (two OR three))', '+field:one +(field:two field:three)'],
      ['one AND (two OR [aaa vvv})', '+one +(two [aaa vvv})'],
      ['one AND (one:two OR two:three) AND four', '+one +(one:two two:three) +four'],
      ['one^1.23', 'one^1.23'],
      ['(one AND two)^100.23', '(+one +two)^100.23'],
      ['field:(one AND two)^100.23', '(+field:one +field:two)^100.23'],
      ['field:(one AND [aaa bbb]^23.3)^100.23', '(+field:one +field:[aaa bbb]^23.3)^100.23'],
      ['(REQ field:"one two three")^23', 'field:"one two three"^23.0'],
      ['asdf~0.2', 'asdf~0.2'],
      ['field:asdf~0.2', 'field:asdf~0.2'],
      ['asdf~0.2^100.0', 'asdf~0.2^100.0'],
      ['field:asdf~0.2^0.1', 'field:asdf~0.2^0.1'],
      ['field:"asdf <> asdf|asdf"~4', 'field:"asdf <> asdf|asdf"~4'],
      ['"one two three four five"~5', '"one two three four five"~5'],
      ['ab?de', 'ab?de'],
      ['ab*de', 'ab*de'],
      ['asdf?*?asd*dsf?asfd*asdf?', 'asdf?*?asd*dsf?asfd*asdf?'],
      ['field:a* AND field:(b*)', '+field:a* +field:b*'],
      ['field:abc~ AND field:(b*)', '+field:abc~0.5 +field:b*'],
      ['asdf?*?asd*dsf?asfd*asdf?^20.0', 'asdf?*?asd*dsf?asfd*asdf?^20.0']
    ]

    queries_and_expected_forms.each do |input, expected|
      assert_equal(expected, @parser.parse(input).to_s(@parser.default_field))
    end
  end
end
|
@@ -0,0 +1,113 @@
|
|
1
|
+
require File.dirname(__FILE__) + "/../../test_helper"
|
2
|
+
|
3
|
+
|
4
|
+
# Tests Ferret's Filter implementations (RangeFilter, QueryFilter,
# CachingWrapperFilter) and FilteredQuery against a small in-memory index.
class FilterTest < Test::Unit::TestCase
  include Ferret::Document
  include Ferret::Search
  include Ferret::Analysis
  include Ferret::Index

  # Adds one document to +writer+ with an unstored, untokenized field for
  # each key/value pair in +hash+.
  def add_doc(hash, writer)
    doc = Document.new()
    hash.each_pair do |field, text|
      doc << Field.new(field, text, Field::Store::NO, Field::Index::UNTOKENIZED)
    end
    writer << doc
  end

  # Builds a fresh 10-document RAM index; doc id N has "int" == N.to_s, so
  # the expected-hit arrays below double as document ids.
  def setup()
    @dir = Ferret::Store::RAMDirectory.new()
    iw = IndexWriter.new(@dir,
                         :analyzer => WhiteSpaceAnalyzer.new(),
                         :create => true)
    docs = [
      {"int"=>"0","date"=>"20040601","switch"=>"on"},
      {"int"=>"1","date"=>"20041001","switch"=>"off"},
      {"int"=>"2","date"=>"20051101","switch"=>"on"},
      {"int"=>"3","date"=>"20041201","switch"=>"off"},
      {"int"=>"4","date"=>"20051101","switch"=>"on"},
      {"int"=>"5","date"=>"20041201","switch"=>"off"},
      {"int"=>"6","date"=>"20050101","switch"=>"on"},
      {"int"=>"7","date"=>"20040701","switch"=>"off"},
      {"int"=>"8","date"=>"20050301","switch"=>"on"},
      {"int"=>"9","date"=>"20050401","switch"=>"off"}
    ]
    docs.each {|doc| add_doc(doc, iw)}
    iw.close
  end

  # FIX: Test::Unit's cleanup hook is #teardown, not #tear_down — the
  # original method name was never invoked, so @dir was never closed.
  def teardown()
    @dir.close()
  end

  # Runs +query+ through +is+ with +filter+ applied and asserts the hit
  # count and per-position document ids match +expected+.
  def do_test_top_docs(is, query, expected, filter)
    top_docs = is.search(query, {:filter => filter})
    assert_equal(expected.size, top_docs.score_docs.size)
    top_docs.total_hits.times do |i|
      assert_equal(expected[i], top_docs.score_docs[i].doc)
    end
  end

  def test_range_filter
    is = IndexSearcher.new(@dir)
    q = MatchAllDocsQuery.new()
    # Cover all four inclusive/exclusive bound combinations ...
    rf = RangeFilter.new("int", "2", "6", true, true)
    do_test_top_docs(is, q, [2,3,4,5,6], rf)
    rf = RangeFilter.new("int", "2", "6", true, false)
    do_test_top_docs(is, q, [2,3,4,5], rf)
    rf = RangeFilter.new("int", "2", "6", false, true)
    do_test_top_docs(is, q, [3,4,5,6], rf)
    rf = RangeFilter.new("int", "2", "6", false, false)
    do_test_top_docs(is, q, [3,4,5], rf)
    # ... and the open-ended convenience constructors.
    rf = RangeFilter.new_more("int", "6")
    do_test_top_docs(is, q, [6,7,8,9], rf)
    rf = RangeFilter.new_more("int", "6", false)
    do_test_top_docs(is, q, [7,8,9], rf)
    rf = RangeFilter.new_less("int", "2")
    do_test_top_docs(is, q, [0,1,2], rf)
    rf = RangeFilter.new_less("int", "2", false)
    do_test_top_docs(is, q, [0,1], rf)
  end

  # Invalid bound combinations must raise: exclusive with a missing bound,
  # upper < lower, and both bounds missing.
  def test_range_filter_errors
    assert_raise(ArgumentError) {f = RangeFilter.new("", "asd", nil, false, true)}
    assert_raise(ArgumentError) {f = RangeFilter.new("", nil, "asd", true, false)}
    assert_raise(ArgumentError) {f = RangeFilter.new("", "ac", "ab", false, false)}
    assert_raise(ArgumentError) {f = RangeFilter.new("", nil, nil, false, false)}
  end

  def test_query_filter()
    is = IndexSearcher.new(@dir)
    q = MatchAllDocsQuery.new()
    qf = QueryFilter.new(TermQuery.new(Term.new("switch", "on")))
    do_test_top_docs(is, q, [0,2,4,6,8], qf)
    # test again to test caching doesn't break it
    do_test_top_docs(is, q, [0,2,4,6,8], qf)
    qf = QueryFilter.new(TermQuery.new(Term.new("switch", "off")))
    do_test_top_docs(is, q, [1,3,5,7,9], qf)
  end

  def test_caching_wrapper_filter
    is = IndexSearcher.new(@dir)
    q = MatchAllDocsQuery.new()
    rf = RangeFilter.new("int", "2", "6", true, true)
    cf = CachingWrapperFilter.new(rf)
    # Run twice: the second call must hit the cache and return the same docs.
    do_test_top_docs(is, q, [2,3,4,5,6], cf)
    do_test_top_docs(is, q, [2,3,4,5,6], cf)
  end

  def test_filtered_query
    is = IndexSearcher.new(@dir)
    q = MatchAllDocsQuery.new()
    rf = RangeFilter.new("int", "2", "6", true, true)
    rq = FilteredQuery.new(q, rf)
    qf = QueryFilter.new(TermQuery.new(Term.new("switch", "on")))
    # [2..6] intersected with switch=on docs => [2,4,6]
    do_test_top_docs(is, rq, [2,4,6], qf)
    # Stack a second FilteredQuery and a further range filter => [4,6]
    query = FilteredQuery.new(rq, qf)
    rf2 = RangeFilter.new_more("int", "3")
    do_test_top_docs(is, query, [4,6], rf2)
  end
end
|
@@ -0,0 +1,136 @@
|
|
1
|
+
require File.dirname(__FILE__) + "/../../test_helper"
|
2
|
+
|
3
|
+
# Tests FuzzyQuery matching behaviour: minimum-similarity cutoffs and the
# effect of the required-prefix length on which terms can match.
class FuzzyQueryTest < Test::Unit::TestCase
  include Ferret::Document
  include Ferret::Search
  include Ferret::Store
  include Ferret::Analysis
  include Ferret::Index

  # Adds a single-field ("field"), tokenized, unstored document to +writer+.
  def add_doc(text, writer)
    doc = Document.new()
    doc << Field.new("field", text, Field::Store::NO, Field::Index::TOKENIZED)
    writer << doc
  end

  def setup()
    @dir = RAMDirectory.new()
  end

  # FIX: Test::Unit's cleanup hook is #teardown, not #tear_down — the
  # original method name was never invoked, so @dir was never closed.
  def teardown()
    @dir.close()
  end

  # Searches +query+ and asserts both hit count and the exact per-position
  # document ids in +expected+.
  def do_test_top_docs(is, query, expected)
    top_docs = is.search(query)
    assert_equal(expected.length, top_docs.total_hits,
                 "expected #{expected.length} hits but got #{top_docs.total_hits}")
    assert_equal(expected.length, top_docs.score_docs.size)
    top_docs.total_hits.times do |i|
      assert_equal(expected[i], top_docs.score_docs[i].doc)
    end
  end

  # Builds a FuzzyQuery on "field" for +text+ with the default minimum
  # similarity and a required prefix of +prefix+ characters, then checks
  # the expected hits.
  def do_prefix_test(is, text, prefix, expected)
    fq = FuzzyQuery.new(Term.new("field", text), FuzzyQuery.default_min_similarity, prefix)
    do_test_top_docs(is, fq, expected)
  end

  def test_fuzziness()
    iw = IndexWriter.new(@dir, :analyzer => WhiteSpaceAnalyzer.new(), :create => true)
    add_doc("aaaaa", iw)
    add_doc("aaaab", iw)
    add_doc("aaabb", iw)
    add_doc("aabbb", iw)
    add_doc("abbbb", iw)
    add_doc("bbbbb", iw)
    add_doc("ddddd", iw)
    iw.close()

    is = IndexSearcher.new(@dir)

    # (removed a dead `fq = FuzzyQuery.new(...)` assignment here — it was
    # never searched; the prefix tests below build their own queries)

    # Exact term plus near neighbours; longer required prefixes prune
    # candidates that differ inside the prefix.
    do_prefix_test(is, "aaaaa", 0, [0,1,2])
    do_prefix_test(is, "aaaaa", 1, [0,1,2])
    do_prefix_test(is, "aaaaa", 2, [0,1,2])
    do_prefix_test(is, "aaaaa", 3, [0,1,2])
    do_prefix_test(is, "aaaaa", 4, [0,1])
    do_prefix_test(is, "aaaaa", 5, [0])
    do_prefix_test(is, "aaaaa", 6, [0])

    # Nothing is similar enough to a completely different term.
    do_prefix_test(is, "xxxxx", 0, [])

    do_prefix_test(is, "aaccc", 0, [])

    do_prefix_test(is, "aaaac", 0, [0,1,2])
    do_prefix_test(is, "aaaac", 1, [0,1,2])
    do_prefix_test(is, "aaaac", 2, [0,1,2])
    do_prefix_test(is, "aaaac", 3, [0,1,2])
    do_prefix_test(is, "aaaac", 4, [0,1])
    do_prefix_test(is, "aaaac", 5, [])

    do_prefix_test(is, "ddddX", 0, [6])
    do_prefix_test(is, "ddddX", 1, [6])
    do_prefix_test(is, "ddddX", 2, [6])
    do_prefix_test(is, "ddddX", 3, [6])
    do_prefix_test(is, "ddddX", 4, [6])
    do_prefix_test(is, "ddddX", 5, [])

    # A fuzzy query on an unknown field matches nothing.
    fq = FuzzyQuery.new(Term.new("anotherfield", "ddddX"), FuzzyQuery.default_min_similarity, 0)
    top_docs = is.search(fq)
    assert_equal(0, top_docs.total_hits)

    is.close()
  end

  def test_fuzziness_long()
    iw = IndexWriter.new(@dir, :analyzer => WhiteSpaceAnalyzer.new(), :create => true)
    add_doc("aaaaaaa", iw)
    add_doc("segment", iw)
    iw.optimize()
    iw.close()
    is = IndexSearcher.new(@dir)

    # not similar enough:
    do_prefix_test(is, "xxxxx", 0, [])

    # edit distance to "aaaaaaa" = 3, this matches because the string is longer than
    # in testDefaultFuzziness so a bigger difference is allowed:
    do_prefix_test(is, "aaaaccc", 0, [0])

    # now with prefix
    do_prefix_test(is, "aaaaccc", 1, [0])
    do_prefix_test(is, "aaaaccc", 4, [0])
    do_prefix_test(is, "aaaaccc", 5, [])

    # no match, more than half of the characters is wrong:
    do_prefix_test(is, "aaacccc", 0, [])

    # now with prefix
    do_prefix_test(is, "aaacccc", 1, [])

    # "student" and "stellent" are indeed similar to "segment" by default:
    do_prefix_test(is, "student", 0, [1])
    do_prefix_test(is, "stellent", 0, [1])

    # now with prefix
    do_prefix_test(is, "student", 2, [])
    do_prefix_test(is, "stellent", 2, [])

    # "student" doesn't match anymore thanks to increased minimum similarity:
    fq = FuzzyQuery.new(Term.new("field", "student"), 0.6, 0)
    top_docs = is.search(fq)
    assert_equal(0, top_docs.total_hits)

    # Minimum similarity must lie in [0, 1).
    assert_raise(ArgumentError) {fq = FuzzyQuery.new(Term.new("f", "s"), 1.1)}
    assert_raise(ArgumentError) {fq = FuzzyQuery.new(Term.new("f", "s"), -0.1)}

    is.close()
  end

end
|
@@ -0,0 +1,188 @@
|
|
1
|
+
require File.dirname(__FILE__) + "/../../test_helper"
|
2
|
+
|
3
|
+
# Exercises IndexSearcher against the shared IndexTestHelper documents with
# the main query types: term, boolean, phrase, range, prefix, wildcard and
# multi-phrase queries.
class IndexSearcherTest < Test::Unit::TestCase
  include Ferret::Document
  include Ferret::Search
  include Ferret::Store
  include Ferret::Analysis
  include Ferret::Index

  def setup()
    @dir = RAMDirectory.new()
    iw = IndexWriter.new(@dir, :analyzer => WhiteSpaceAnalyzer.new(), :create => true)
    @documents = IndexTestHelper.prepare_search_docs()
    @documents.each { |doc| iw << doc; }
    iw.close()
    @is = IndexSearcher.new(@dir)
  end

  # FIX: Test::Unit's cleanup hook is #teardown, not #tear_down — the
  # original method name was never invoked, so the searcher and directory
  # were never closed.
  def teardown()
    @is.close
    @dir.close()
  end

  # Returns the document ids from an array of score_docs.
  def get_docs(score_docs)
    docs = []
    score_docs.each do |score_doc|
      docs << score_doc.doc
    end
    docs
  end

  # Searches +query+ and asserts:
  #   * the result size matches +expected+ (a set of doc ids),
  #   * the top-scoring doc is +top+ (when given),
  #   * total_hits equals +total_hits+ (when given; otherwise expected.length),
  #   * every returned doc is in +expected+,
  #   * each score agrees with the explanation's value.
  def check_hits(query, expected, top=nil, total_hits=nil)
    top_docs = @is.search(query)
    assert_equal(expected.length, top_docs.score_docs.size)
    assert_equal(top, top_docs.score_docs[0].doc) if top
    if total_hits
      assert_equal(total_hits, top_docs.total_hits)
    else
      assert_equal(expected.length, top_docs.total_hits)
    end
    top_docs.score_docs.each do |score_doc|
      assert(expected.include?(score_doc.doc),
             "#{score_doc.doc} was found unexpectedly")
      # NOTE(review): Float =~ Float — presumably Ferret monkey-patches an
      # approximate-equality #=~ onto Float; confirm, since Object#=~
      # returns nil and would always fail this assertion otherwise.
      assert(score_doc.score =~ @is.explain(query, score_doc.doc).value,
             "Scores(#{score_doc.score} != #{@is.explain(query, score_doc.doc).value})")
    end
  end

  def test_term_query
    tq = TermQuery.new(Term.new("field", "word2"));
    tq.boost = 100
    check_hits(tq, [1,4,8])

    # "word1" is in every document; default search returns at most 10
    # score_docs but reports the full hit count.
    tq = TermQuery.new(Term.new("field", "word1"));
    top_docs = @is.search(tq)
    assert_equal(@documents.size, top_docs.total_hits)
    assert_equal(10, top_docs.score_docs.size)
    top_docs = @is.search(tq, {:num_docs => 20})
    assert_equal(@documents.size, top_docs.score_docs.size)
  end

  def test_boolean_query
    bq = BooleanQuery.new()
    tq1 = TermQuery.new(Term.new("field", "word1"))
    tq2 = TermQuery.new(Term.new("field", "word3"))
    bq.add_query(tq1, BooleanClause::Occur::MUST)
    bq.add_query(tq2, BooleanClause::Occur::MUST)
    check_hits(bq, [2,3,6,8,11,14], 14)

    # Adding a SHOULD clause changes scoring (top doc) but not the hit set.
    tq3 = TermQuery.new(Term.new("field", "word2"))
    bq.add_query(tq3, BooleanClause::Occur::SHOULD)
    check_hits(bq, [2,3,6,8,11,14], 8)

    bq = BooleanQuery.new()
    bq.add_query(tq2, BooleanClause::Occur::MUST)
    bq.add_query(tq3, BooleanClause::Occur::MUST_NOT)
    check_hits(bq, [2,3,6,11,14])

    # A lone MUST_NOT clause can never match anything.
    bq = BooleanQuery.new()
    bq.add_query(tq2, BooleanClause::Occur::MUST_NOT)
    check_hits(bq, [])

    bq = BooleanQuery.new()
    bq.add_query(tq2, BooleanClause::Occur::SHOULD)
    bq.add_query(tq3, BooleanClause::Occur::SHOULD)
    check_hits(bq, [1,2,3,4,6,8,11,14])
  end

  def test_phrase_query()
    pq = PhraseQuery.new()
    t1 = Term.new("field", "quick")
    t2 = Term.new("field", "brown")
    t3 = Term.new("field", "fox")
    pq << t1 << t2 << t3
    check_hits(pq, [1])

    pq.slop = 4
    check_hits(pq, [1,16,17])

    # Sparse phrase: "quick" then "fox" two positions later.
    pq = PhraseQuery.new()
    pq << t1
    pq.add(t3, 2)
    check_hits(pq, [1,11,14])

    pq.slop = 1
    check_hits(pq, [1,11,14,16])

    pq.slop = 4
    check_hits(pq, [1,11,14,16,17])
  end

  def test_range_query()
    # All four inclusive/exclusive bound combinations on a closed range ...
    rq = RangeQuery.new("date", "20051006", "20051010", true, true)
    check_hits(rq, [6,7,8,9,10])

    rq = RangeQuery.new("date", "20051006", "20051010", false, true)
    check_hits(rq, [7,8,9,10])

    rq = RangeQuery.new("date", "20051006", "20051010", true, false)
    check_hits(rq, [6,7,8,9])

    rq = RangeQuery.new("date", "20051006", "20051010", false, false)
    check_hits(rq, [7,8,9])

    # ... then open-ended ranges and the new_less/new_more shortcuts.
    rq = RangeQuery.new("date", nil, "20051003", false, true)
    check_hits(rq, [0,1,2,3])

    rq = RangeQuery.new("date", nil, "20051003", false, false)
    check_hits(rq, [0,1,2])

    rq = RangeQuery.new_less("date", "20051003", true)
    check_hits(rq, [0,1,2,3])

    rq = RangeQuery.new_less("date", "20051003", false)
    check_hits(rq, [0,1,2])

    rq = RangeQuery.new("date", "20051014", nil, true, false)
    check_hits(rq, [14,15,16,17])

    rq = RangeQuery.new("date", "20051014", nil, false, false)
    check_hits(rq, [15,16,17])

    rq = RangeQuery.new_more("date", "20051014", true)
    check_hits(rq, [14,15,16,17])

    rq = RangeQuery.new_more("date", "20051014", false)
    check_hits(rq, [15,16,17])
  end

  def test_prefix_query()
    t = Term.new("cat", "cat1")
    pq = PrefixQuery.new(t)
    check_hits(pq, [0, 1, 2, 3, 4, 13, 14, 15, 16, 17])

    t.text = "cat1/sub2"
    pq = PrefixQuery.new(t)
    check_hits(pq, [3, 4, 13, 15])
  end

  def test_wildcard_query()
    t = Term.new("cat", "cat1*")
    wq = WildcardQuery.new(t)
    check_hits(wq, [0, 1, 2, 3, 4, 13, 14, 15, 16, 17])

    t.text = "cat1*/su??ub2"
    wq = WildcardQuery.new(t)
    check_hits(wq, [4, 16])
  end

  # FIX: this method was originally (mis)named test_prefix_query, which
  # silently redefined the real prefix-query test above so it never ran.
  def test_multi_phrase_query()
    t11 = Term.new("field", "quick")
    t12 = Term.new("field", "fast")
    t21 = Term.new("field", "brown")
    t22 = Term.new("field", "red")
    t23 = Term.new("field", "hairy")
    t3 = Term.new("field", "fox")

    # Each position may match any of several alternative terms.
    mpq = MultiPhraseQuery.new()
    mpq << [t11, t12]
    mpq << [t21, t22, t23]
    mpq << t3
    check_hits(mpq, [1, 8, 11, 14])

    mpq.slop = 4
    check_hits(mpq, [1, 8, 11, 14, 16, 17])
  end
end
|