ferret 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202) hide show
  1. data/MIT-LICENSE +20 -0
  2. data/README +109 -0
  3. data/Rakefile +275 -0
  4. data/TODO +9 -0
  5. data/TUTORIAL +197 -0
  6. data/ext/extconf.rb +3 -0
  7. data/ext/ferret.c +23 -0
  8. data/ext/ferret.h +85 -0
  9. data/ext/index_io.c +543 -0
  10. data/ext/priority_queue.c +227 -0
  11. data/ext/ram_directory.c +316 -0
  12. data/ext/segment_merge_queue.c +41 -0
  13. data/ext/string_helper.c +42 -0
  14. data/ext/tags +240 -0
  15. data/ext/term.c +261 -0
  16. data/ext/term_buffer.c +299 -0
  17. data/ext/util.c +12 -0
  18. data/lib/ferret.rb +41 -0
  19. data/lib/ferret/analysis.rb +11 -0
  20. data/lib/ferret/analysis/analyzers.rb +93 -0
  21. data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
  22. data/lib/ferret/analysis/token.rb +79 -0
  23. data/lib/ferret/analysis/token_filters.rb +86 -0
  24. data/lib/ferret/analysis/token_stream.rb +26 -0
  25. data/lib/ferret/analysis/tokenizers.rb +107 -0
  26. data/lib/ferret/analysis/word_list_loader.rb +27 -0
  27. data/lib/ferret/document.rb +2 -0
  28. data/lib/ferret/document/document.rb +152 -0
  29. data/lib/ferret/document/field.rb +304 -0
  30. data/lib/ferret/index.rb +26 -0
  31. data/lib/ferret/index/compound_file_io.rb +343 -0
  32. data/lib/ferret/index/document_writer.rb +288 -0
  33. data/lib/ferret/index/field_infos.rb +259 -0
  34. data/lib/ferret/index/fields_io.rb +175 -0
  35. data/lib/ferret/index/index.rb +228 -0
  36. data/lib/ferret/index/index_file_names.rb +33 -0
  37. data/lib/ferret/index/index_reader.rb +462 -0
  38. data/lib/ferret/index/index_writer.rb +488 -0
  39. data/lib/ferret/index/multi_reader.rb +363 -0
  40. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
  41. data/lib/ferret/index/segment_infos.rb +130 -0
  42. data/lib/ferret/index/segment_merge_info.rb +47 -0
  43. data/lib/ferret/index/segment_merge_queue.rb +16 -0
  44. data/lib/ferret/index/segment_merger.rb +337 -0
  45. data/lib/ferret/index/segment_reader.rb +380 -0
  46. data/lib/ferret/index/segment_term_enum.rb +178 -0
  47. data/lib/ferret/index/segment_term_vector.rb +58 -0
  48. data/lib/ferret/index/term.rb +49 -0
  49. data/lib/ferret/index/term_buffer.rb +88 -0
  50. data/lib/ferret/index/term_doc_enum.rb +283 -0
  51. data/lib/ferret/index/term_enum.rb +52 -0
  52. data/lib/ferret/index/term_info.rb +41 -0
  53. data/lib/ferret/index/term_infos_io.rb +312 -0
  54. data/lib/ferret/index/term_vector_offset_info.rb +20 -0
  55. data/lib/ferret/index/term_vectors_io.rb +552 -0
  56. data/lib/ferret/query_parser.rb +274 -0
  57. data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
  58. data/lib/ferret/search.rb +49 -0
  59. data/lib/ferret/search/boolean_clause.rb +100 -0
  60. data/lib/ferret/search/boolean_query.rb +303 -0
  61. data/lib/ferret/search/boolean_scorer.rb +294 -0
  62. data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
  63. data/lib/ferret/search/conjunction_scorer.rb +99 -0
  64. data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
  65. data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
  66. data/lib/ferret/search/explanation.rb +41 -0
  67. data/lib/ferret/search/field_cache.rb +216 -0
  68. data/lib/ferret/search/field_doc.rb +31 -0
  69. data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
  70. data/lib/ferret/search/filter.rb +11 -0
  71. data/lib/ferret/search/filtered_query.rb +130 -0
  72. data/lib/ferret/search/filtered_term_enum.rb +79 -0
  73. data/lib/ferret/search/fuzzy_query.rb +153 -0
  74. data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
  75. data/lib/ferret/search/hit_collector.rb +34 -0
  76. data/lib/ferret/search/hit_queue.rb +11 -0
  77. data/lib/ferret/search/index_searcher.rb +173 -0
  78. data/lib/ferret/search/match_all_docs_query.rb +104 -0
  79. data/lib/ferret/search/multi_phrase_query.rb +204 -0
  80. data/lib/ferret/search/multi_term_query.rb +65 -0
  81. data/lib/ferret/search/non_matching_scorer.rb +22 -0
  82. data/lib/ferret/search/phrase_positions.rb +55 -0
  83. data/lib/ferret/search/phrase_query.rb +217 -0
  84. data/lib/ferret/search/phrase_scorer.rb +153 -0
  85. data/lib/ferret/search/prefix_query.rb +47 -0
  86. data/lib/ferret/search/query.rb +111 -0
  87. data/lib/ferret/search/query_filter.rb +51 -0
  88. data/lib/ferret/search/range_filter.rb +103 -0
  89. data/lib/ferret/search/range_query.rb +139 -0
  90. data/lib/ferret/search/req_excl_scorer.rb +125 -0
  91. data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
  92. data/lib/ferret/search/score_doc.rb +38 -0
  93. data/lib/ferret/search/score_doc_comparator.rb +114 -0
  94. data/lib/ferret/search/scorer.rb +91 -0
  95. data/lib/ferret/search/similarity.rb +278 -0
  96. data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
  97. data/lib/ferret/search/sort.rb +105 -0
  98. data/lib/ferret/search/sort_comparator.rb +60 -0
  99. data/lib/ferret/search/sort_field.rb +87 -0
  100. data/lib/ferret/search/spans.rb +12 -0
  101. data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
  102. data/lib/ferret/search/spans/span_first_query.rb +79 -0
  103. data/lib/ferret/search/spans/span_near_query.rb +108 -0
  104. data/lib/ferret/search/spans/span_not_query.rb +130 -0
  105. data/lib/ferret/search/spans/span_or_query.rb +176 -0
  106. data/lib/ferret/search/spans/span_query.rb +25 -0
  107. data/lib/ferret/search/spans/span_scorer.rb +74 -0
  108. data/lib/ferret/search/spans/span_term_query.rb +105 -0
  109. data/lib/ferret/search/spans/span_weight.rb +84 -0
  110. data/lib/ferret/search/spans/spans_enum.rb +44 -0
  111. data/lib/ferret/search/term_query.rb +128 -0
  112. data/lib/ferret/search/term_scorer.rb +181 -0
  113. data/lib/ferret/search/top_docs.rb +24 -0
  114. data/lib/ferret/search/top_field_docs.rb +17 -0
  115. data/lib/ferret/search/weight.rb +54 -0
  116. data/lib/ferret/search/wildcard_query.rb +26 -0
  117. data/lib/ferret/search/wildcard_term_enum.rb +61 -0
  118. data/lib/ferret/stemmers.rb +1 -0
  119. data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
  120. data/lib/ferret/store.rb +5 -0
  121. data/lib/ferret/store/buffered_index_io.rb +191 -0
  122. data/lib/ferret/store/directory.rb +139 -0
  123. data/lib/ferret/store/fs_store.rb +338 -0
  124. data/lib/ferret/store/index_io.rb +259 -0
  125. data/lib/ferret/store/ram_store.rb +282 -0
  126. data/lib/ferret/utils.rb +7 -0
  127. data/lib/ferret/utils/bit_vector.rb +105 -0
  128. data/lib/ferret/utils/date_tools.rb +138 -0
  129. data/lib/ferret/utils/number_tools.rb +91 -0
  130. data/lib/ferret/utils/parameter.rb +41 -0
  131. data/lib/ferret/utils/priority_queue.rb +120 -0
  132. data/lib/ferret/utils/string_helper.rb +47 -0
  133. data/lib/ferret/utils/weak_key_hash.rb +51 -0
  134. data/rake_utils/code_statistics.rb +106 -0
  135. data/setup.rb +1551 -0
  136. data/test/benchmark/tb_ram_store.rb +76 -0
  137. data/test/benchmark/tb_rw_vint.rb +26 -0
  138. data/test/longrunning/tc_numbertools.rb +60 -0
  139. data/test/longrunning/tm_store.rb +19 -0
  140. data/test/test_all.rb +9 -0
  141. data/test/test_helper.rb +6 -0
  142. data/test/unit/analysis/tc_analyzer.rb +21 -0
  143. data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
  144. data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
  145. data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
  146. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
  147. data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
  148. data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
  149. data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
  150. data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
  151. data/test/unit/analysis/tc_stop_filter.rb +14 -0
  152. data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
  153. data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
  154. data/test/unit/analysis/tc_word_list_loader.rb +32 -0
  155. data/test/unit/document/tc_document.rb +47 -0
  156. data/test/unit/document/tc_field.rb +80 -0
  157. data/test/unit/index/tc_compound_file_io.rb +107 -0
  158. data/test/unit/index/tc_field_infos.rb +119 -0
  159. data/test/unit/index/tc_fields_io.rb +167 -0
  160. data/test/unit/index/tc_index.rb +140 -0
  161. data/test/unit/index/tc_index_reader.rb +622 -0
  162. data/test/unit/index/tc_index_writer.rb +57 -0
  163. data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
  164. data/test/unit/index/tc_segment_infos.rb +74 -0
  165. data/test/unit/index/tc_segment_term_docs.rb +17 -0
  166. data/test/unit/index/tc_segment_term_enum.rb +60 -0
  167. data/test/unit/index/tc_segment_term_vector.rb +71 -0
  168. data/test/unit/index/tc_term.rb +22 -0
  169. data/test/unit/index/tc_term_buffer.rb +57 -0
  170. data/test/unit/index/tc_term_info.rb +19 -0
  171. data/test/unit/index/tc_term_infos_io.rb +192 -0
  172. data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
  173. data/test/unit/index/tc_term_vectors_io.rb +108 -0
  174. data/test/unit/index/th_doc.rb +244 -0
  175. data/test/unit/query_parser/tc_query_parser.rb +84 -0
  176. data/test/unit/search/tc_filter.rb +113 -0
  177. data/test/unit/search/tc_fuzzy_query.rb +136 -0
  178. data/test/unit/search/tc_index_searcher.rb +188 -0
  179. data/test/unit/search/tc_search_and_sort.rb +98 -0
  180. data/test/unit/search/tc_similarity.rb +37 -0
  181. data/test/unit/search/tc_sort.rb +48 -0
  182. data/test/unit/search/tc_sort_field.rb +27 -0
  183. data/test/unit/search/tc_spans.rb +153 -0
  184. data/test/unit/store/tc_fs_store.rb +84 -0
  185. data/test/unit/store/tc_ram_store.rb +35 -0
  186. data/test/unit/store/tm_store.rb +180 -0
  187. data/test/unit/store/tm_store_lock.rb +68 -0
  188. data/test/unit/ts_analysis.rb +16 -0
  189. data/test/unit/ts_document.rb +4 -0
  190. data/test/unit/ts_index.rb +18 -0
  191. data/test/unit/ts_query_parser.rb +3 -0
  192. data/test/unit/ts_search.rb +10 -0
  193. data/test/unit/ts_store.rb +6 -0
  194. data/test/unit/ts_utils.rb +10 -0
  195. data/test/unit/utils/tc_bit_vector.rb +65 -0
  196. data/test/unit/utils/tc_date_tools.rb +50 -0
  197. data/test/unit/utils/tc_number_tools.rb +59 -0
  198. data/test/unit/utils/tc_parameter.rb +40 -0
  199. data/test/unit/utils/tc_priority_queue.rb +62 -0
  200. data/test/unit/utils/tc_string_helper.rb +21 -0
  201. data/test/unit/utils/tc_weak_key_hash.rb +25 -0
  202. metadata +251 -0
@@ -0,0 +1,84 @@
1
require File.dirname(__FILE__) + "/../../test_helper"


# Exercises Ferret::QueryParser: every input string on the left must parse
# into a query whose canonical string form (via Query#to_s against the
# parser's default field) equals the string on the right.
class QueryParserTest < Test::Unit::TestCase

  def setup
    @parser = Ferret::QueryParser.new("xxx")
  end

  def test_strings
    # [raw query string, expected canonical representation]
    cases = [
      ['word', 'word'],
      ['field:word', 'field:word'],
      ['"word1 word2 word3"', '"word word word"'],
      ['"word1 2342 word3"', '"word word"'],
      ['field:"one two three"', 'field:"one two three"'],
      ['field:"one 222 three"', 'field:"one three"'],
      ['field:"one <> three"', 'field:"one <> three"'],
      ['field:"one <> three <>"', 'field:"one <> three"'],
      ['field:"one <> <> <> three <>"', 'field:"one <> <> <> three"'],
      ['field:"one <> <> <> three|four|five <>"', 'field:"one <> <> <> three|four|five"'],
      ['field:"one|two three|four|five six|seven"', 'field:"one|two three|four|five six|seven"'],
      ['[aaa bbb]', '[aaa bbb]'],
      ['{aaa bbb]', '{aaa bbb]'],
      ['field:[aaa bbb}', 'field:[aaa bbb}'],
      ['{aaa bbb}', '{aaa bbb}'],
      ['{aaa|', '{aaa|'],
      ['[aaa|', '[aaa|'],
      ['field:|aaa}', 'field:|aaa}'],
      ['|aaa]', '|aaa]'],
      ['>aaa', '{aaa|'],
      ['>=aaa', '[aaa|'],
      ['<aaa', '|aaa}'],
      ['field:<=aaa', 'field:|aaa]'],
      ['REQ one REQ two', '+one +two'],
      ['REQ one two', '+one two'],
      ['one REQ two', 'one +two'],
      ['+one +two', '+one +two'],
      ['+one two', '+one two'],
      ['one +two', 'one +two'],
      ['-one -two', '-one -two'],
      ['-one two', '-one two'],
      ['one -two', 'one -two'],
      ['!one !two', '-one -two'],
      ['!one two', '-one two'],
      ['one !two', 'one -two'],
      ['NOT one NOT two', '-one -two'],
      ['NOT one two', '-one two'],
      ['one NOT two', 'one -two'],
      ['one two', 'one two'],
      ['one OR two', 'one two'],
      ['one AND two', '+one +two'],
      ['one two AND three', 'one two +three'],
      ['one two OR three', 'one two three'],
      ['one (two AND three)', 'one (+two +three)'],
      ['one AND (two OR three)', '+one +(two three)'],
      ['field:(one AND (two OR three))', '+field:one +(field:two field:three)'],
      ['one AND (two OR [aaa vvv})', '+one +(two [aaa vvv})'],
      ['one AND (one:two OR two:three) AND four', '+one +(one:two two:three) +four'],
      ['one^1.23', 'one^1.23'],
      ['(one AND two)^100.23', '(+one +two)^100.23'],
      ['field:(one AND two)^100.23', '(+field:one +field:two)^100.23'],
      ['field:(one AND [aaa bbb]^23.3)^100.23', '(+field:one +field:[aaa bbb]^23.3)^100.23'],
      ['(REQ field:"one two three")^23', 'field:"one two three"^23.0'],
      ['asdf~0.2', 'asdf~0.2'],
      ['field:asdf~0.2', 'field:asdf~0.2'],
      ['asdf~0.2^100.0', 'asdf~0.2^100.0'],
      ['field:asdf~0.2^0.1', 'field:asdf~0.2^0.1'],
      ['field:"asdf <> asdf|asdf"~4', 'field:"asdf <> asdf|asdf"~4'],
      ['"one two three four five"~5', '"one two three four five"~5'],
      ['ab?de', 'ab?de'],
      ['ab*de', 'ab*de'],
      ['asdf?*?asd*dsf?asfd*asdf?', 'asdf?*?asd*dsf?asfd*asdf?'],
      ['field:a* AND field:(b*)', '+field:a* +field:b*'],
      ['field:abc~ AND field:(b*)', '+field:abc~0.5 +field:b*'],
      ['asdf?*?asd*dsf?asfd*asdf?^20.0', 'asdf?*?asd*dsf?asfd*asdf?^20.0']
    ]

    cases.each do |input, expected|
      assert_equal(expected, @parser.parse(input).to_s(@parser.default_field))
    end
  end
end
@@ -0,0 +1,113 @@
1
require File.dirname(__FILE__) + "/../../test_helper"


# Tests Ferret's search filters — RangeFilter, QueryFilter,
# CachingWrapperFilter and FilteredQuery — against a small RAM-backed index
# of ten documents with "int", "date" and "switch" fields.
class FilterTest < Test::Unit::TestCase
  include Ferret::Document
  include Ferret::Search
  include Ferret::Analysis
  include Ferret::Index

  # Adds one document to +writer+ with an unstored, untokenized field for
  # each key/value pair in +hash+.
  def add_doc(hash, writer)
    doc = Document.new()
    hash.each_pair do |field, text|
      doc << Field.new(field, text, Field::Store::NO, Field::Index::UNTOKENIZED)
    end
    writer << doc
  end

  def setup()
    @dir = Ferret::Store::RAMDirectory.new()
    iw = IndexWriter.new(@dir,
                         :analyzer => WhiteSpaceAnalyzer.new(),
                         :create => true)
    docs = [
      {"int"=>"0","date"=>"20040601","switch"=>"on"},
      {"int"=>"1","date"=>"20041001","switch"=>"off"},
      {"int"=>"2","date"=>"20051101","switch"=>"on"},
      {"int"=>"3","date"=>"20041201","switch"=>"off"},
      {"int"=>"4","date"=>"20051101","switch"=>"on"},
      {"int"=>"5","date"=>"20041201","switch"=>"off"},
      {"int"=>"6","date"=>"20050101","switch"=>"on"},
      {"int"=>"7","date"=>"20040701","switch"=>"off"},
      {"int"=>"8","date"=>"20050301","switch"=>"on"},
      {"int"=>"9","date"=>"20050401","switch"=>"off"}
    ]
    docs.each {|doc| add_doc(doc, iw)}
    iw.close
  end

  # Test::Unit's cleanup hook is +teardown+; the original +tear_down+
  # spelling was never invoked by the framework, so the directory was
  # never closed after each test.
  def teardown()
    @dir.close()
  end
  # Preserve the old name in case anything called it directly.
  alias tear_down teardown

  # Searches +query+ on +is+ with +filter+ applied and asserts that the
  # matched document ids are exactly +expected+, in order.
  def do_test_top_docs(is, query, expected, filter)
    top_docs = is.search(query, {:filter => filter})
    assert_equal(expected.size, top_docs.score_docs.size)
    top_docs.total_hits.times do |i|
      assert_equal(expected[i], top_docs.score_docs[i].doc)
    end
  end

  def test_range_filter
    is = IndexSearcher.new(@dir)
    q = MatchAllDocsQuery.new()
    # All four combinations of inclusive/exclusive bounds.
    rf = RangeFilter.new("int", "2", "6", true, true)
    do_test_top_docs(is, q, [2,3,4,5,6], rf)
    rf = RangeFilter.new("int", "2", "6", true, false)
    do_test_top_docs(is, q, [2,3,4,5], rf)
    rf = RangeFilter.new("int", "2", "6", false, true)
    do_test_top_docs(is, q, [3,4,5,6], rf)
    rf = RangeFilter.new("int", "2", "6", false, false)
    do_test_top_docs(is, q, [3,4,5], rf)
    # Open-ended ranges.
    rf = RangeFilter.new_more("int", "6")
    do_test_top_docs(is, q, [6,7,8,9], rf)
    rf = RangeFilter.new_more("int", "6", false)
    do_test_top_docs(is, q, [7,8,9], rf)
    rf = RangeFilter.new_less("int", "2")
    do_test_top_docs(is, q, [0,1,2], rf)
    rf = RangeFilter.new_less("int", "2", false)
    do_test_top_docs(is, q, [0,1], rf)
  end

  # Invalid constructions: inclusive flag on a missing bound, lower bound
  # greater than upper bound, and no bounds at all must all raise.
  def test_range_filter_errors
    assert_raise(ArgumentError) {f = RangeFilter.new("", "asd", nil, false, true)}
    assert_raise(ArgumentError) {f = RangeFilter.new("", nil, "asd", true, false)}
    assert_raise(ArgumentError) {f = RangeFilter.new("", "ac", "ab", false, false)}
    assert_raise(ArgumentError) {f = RangeFilter.new("", nil, nil, false, false)}
  end

  def test_query_filter()
    is = IndexSearcher.new(@dir)
    q = MatchAllDocsQuery.new()
    qf = QueryFilter.new(TermQuery.new(Term.new("switch", "on")))
    do_test_top_docs(is, q, [0,2,4,6,8], qf)
    # test again to test caching doesn't break it
    do_test_top_docs(is, q, [0,2,4,6,8], qf)
    qf = QueryFilter.new(TermQuery.new(Term.new("switch", "off")))
    do_test_top_docs(is, q, [1,3,5,7,9], qf)
  end

  def test_caching_wrapper_filter
    is = IndexSearcher.new(@dir)
    q = MatchAllDocsQuery.new()
    rf = RangeFilter.new("int", "2", "6", true, true)
    cf = CachingWrapperFilter.new(rf)
    # Second call exercises the cached bit-set path.
    do_test_top_docs(is, q, [2,3,4,5,6], cf)
    do_test_top_docs(is, q, [2,3,4,5,6], cf)
  end

  def test_filtered_query
    is = IndexSearcher.new(@dir)
    q = MatchAllDocsQuery.new()
    rf = RangeFilter.new("int", "2", "6", true, true)
    rq = FilteredQuery.new(q, rf)
    qf = QueryFilter.new(TermQuery.new(Term.new("switch", "on")))
    do_test_top_docs(is, rq, [2,4,6], qf)
    # Stack a second FilteredQuery on top and filter that again.
    query = FilteredQuery.new(rq, qf)
    rf2 = RangeFilter.new_more("int", "3")
    do_test_top_docs(is, query, [4,6], rf2)
  end
end
@@ -0,0 +1,136 @@
1
require File.dirname(__FILE__) + "/../../test_helper"

# Tests FuzzyQuery's edit-distance matching, including the effect of the
# prefix-length parameter and the minimum-similarity threshold.
class FuzzyQueryTest < Test::Unit::TestCase
  include Ferret::Document
  include Ferret::Search
  include Ferret::Store
  include Ferret::Analysis
  include Ferret::Index

  # Adds one document whose tokenized, unstored "field" contains +text+.
  def add_doc(text, writer)
    doc = Document.new()
    doc << Field.new("field", text, Field::Store::NO, Field::Index::TOKENIZED)
    writer << doc
  end

  def setup()
    @dir = RAMDirectory.new()
  end

  # Test::Unit's cleanup hook is +teardown+; the original +tear_down+
  # spelling was never invoked by the framework, so the directory was
  # never closed after each test.
  def teardown()
    @dir.close()
  end
  # Preserve the old name in case anything called it directly.
  alias tear_down teardown

  # Asserts that +query+ hits exactly the document ids in +expected+,
  # in order.
  def do_test_top_docs(is, query, expected)
    top_docs = is.search(query)
    assert_equal(expected.length, top_docs.total_hits,
                 "expected #{expected.length} hits but got #{top_docs.total_hits}")
    assert_equal(expected.length, top_docs.score_docs.size)
    top_docs.total_hits.times do |i|
      assert_equal(expected[i], top_docs.score_docs[i].doc)
    end
  end

  # Builds a FuzzyQuery for "field":+text+ with the default minimum
  # similarity and the given +prefix+ length, then checks its hits.
  def do_prefix_test(is, text, prefix, expected)
    fq = FuzzyQuery.new(Term.new("field", text), FuzzyQuery.default_min_similarity, prefix)
    do_test_top_docs(is, fq, expected)
  end

  def test_fuzziness()
    iw = IndexWriter.new(@dir, :analyzer => WhiteSpaceAnalyzer.new(), :create => true)
    add_doc("aaaaa", iw)
    add_doc("aaaab", iw)
    add_doc("aaabb", iw)
    add_doc("aabbb", iw)
    add_doc("abbbb", iw)
    add_doc("bbbbb", iw)
    add_doc("ddddd", iw)
    iw.close()

    is = IndexSearcher.new(@dir)

    # (A stray unused FuzzyQuery construction that the original built here
    # has been removed — it was never searched.)
    do_prefix_test(is, "aaaaa", 0, [0,1,2])
    do_prefix_test(is, "aaaaa", 1, [0,1,2])
    do_prefix_test(is, "aaaaa", 2, [0,1,2])
    do_prefix_test(is, "aaaaa", 3, [0,1,2])
    do_prefix_test(is, "aaaaa", 4, [0,1])
    do_prefix_test(is, "aaaaa", 5, [0])
    do_prefix_test(is, "aaaaa", 6, [0])

    do_prefix_test(is, "xxxxx", 0, [])

    do_prefix_test(is, "aaccc", 0, [])

    do_prefix_test(is, "aaaac", 0, [0,1,2])
    do_prefix_test(is, "aaaac", 1, [0,1,2])
    do_prefix_test(is, "aaaac", 2, [0,1,2])
    do_prefix_test(is, "aaaac", 3, [0,1,2])
    do_prefix_test(is, "aaaac", 4, [0,1])
    do_prefix_test(is, "aaaac", 5, [])

    do_prefix_test(is, "ddddX", 0, [6])
    do_prefix_test(is, "ddddX", 1, [6])
    do_prefix_test(is, "ddddX", 2, [6])
    do_prefix_test(is, "ddddX", 3, [6])
    do_prefix_test(is, "ddddX", 4, [6])
    do_prefix_test(is, "ddddX", 5, [])

    # A field that was never indexed must produce no hits.
    fq = FuzzyQuery.new(Term.new("anotherfield", "ddddX"), FuzzyQuery.default_min_similarity, 0)
    top_docs = is.search(fq)
    assert_equal(0, top_docs.total_hits)

    is.close()
  end

  def test_fuzziness_long()
    iw = IndexWriter.new(@dir, :analyzer => WhiteSpaceAnalyzer.new(), :create => true)
    add_doc("aaaaaaa", iw)
    add_doc("segment", iw)
    iw.optimize()
    iw.close()
    is = IndexSearcher.new(@dir)

    # not similar enough:
    do_prefix_test(is, "xxxxx", 0, [])

    # edit distance to "aaaaaaa" = 3, this matches because the string is longer than
    # in testDefaultFuzziness so a bigger difference is allowed:
    do_prefix_test(is, "aaaaccc", 0, [0])

    # now with prefix
    do_prefix_test(is, "aaaaccc", 1, [0])
    do_prefix_test(is, "aaaaccc", 4, [0])
    do_prefix_test(is, "aaaaccc", 5, [])

    # no match, more than half of the characters is wrong:
    do_prefix_test(is, "aaacccc", 0, [])

    # now with prefix
    do_prefix_test(is, "aaacccc", 1, [])

    # "student" and "stellent" are indeed similar to "segment" by default:
    do_prefix_test(is, "student", 0, [1])
    do_prefix_test(is, "stellent", 0, [1])

    # now with prefix
    do_prefix_test(is, "student", 2, [])
    do_prefix_test(is, "stellent", 2, [])

    # "student" doesn't match anymore thanks to increased minimum similarity:
    fq = FuzzyQuery.new(Term.new("field", "student"), 0.6, 0)
    top_docs = is.search(fq)
    assert_equal(0, top_docs.total_hits)

    # Minimum similarity must lie in [0, 1).
    assert_raise(ArgumentError) {fq = FuzzyQuery.new(Term.new("f", "s"), 1.1)}
    assert_raise(ArgumentError) {fq = FuzzyQuery.new(Term.new("f", "s"), -0.1)}

    is.close()
  end

end
@@ -0,0 +1,188 @@
1
require File.dirname(__FILE__) + "/../../test_helper"

# Integration tests for IndexSearcher over the shared search-document
# fixture (IndexTestHelper.prepare_search_docs): term, boolean, phrase,
# range, prefix, wildcard and multi-phrase queries.
class IndexSearcherTest < Test::Unit::TestCase
  include Ferret::Document
  include Ferret::Search
  include Ferret::Store
  include Ferret::Analysis
  include Ferret::Index

  def setup()
    @dir = RAMDirectory.new()
    iw = IndexWriter.new(@dir, :analyzer => WhiteSpaceAnalyzer.new(), :create => true)
    @documents = IndexTestHelper.prepare_search_docs()
    @documents.each { |doc| iw << doc; }
    iw.close()
    @is = IndexSearcher.new(@dir)
  end

  # Test::Unit's cleanup hook is +teardown+; the original +tear_down+
  # spelling was never invoked by the framework, so the searcher and
  # directory were never closed after each test.
  def teardown()
    @is.close
    @dir.close()
  end
  # Preserve the old name in case anything called it directly.
  alias tear_down teardown

  # Returns just the document ids from an array of score docs.
  def get_docs(score_docs)
    docs = []
    score_docs.each do |score_doc|
      docs << score_doc.doc
    end
    docs
  end

  # Runs +query+ and asserts the hit set equals +expected+ (order-free),
  # optionally checking the top-ranked doc and the total hit count.
  def check_hits(query, expected, top=nil, total_hits=nil)
    top_docs = @is.search(query)
    assert_equal(expected.length, top_docs.score_docs.size)
    assert_equal(top, top_docs.score_docs[0].doc) if top
    if total_hits
      assert_equal(total_hits, top_docs.total_hits)
    else
      assert_equal(expected.length, top_docs.total_hits)
    end
    top_docs.score_docs.each do |score_doc|
      assert(expected.include?(score_doc.doc),
             "#{score_doc.doc} was found unexpectedly")
      # NOTE(review): `=~` between a Float score and the explanation value
      # looks like it relies on an approximate-match operator defined by the
      # test helpers — plain Float#=~ does not exist; confirm test_helper
      # provides it.
      assert(score_doc.score =~ @is.explain(query, score_doc.doc).value,
             "Scores(#{score_doc.score} != #{@is.explain(query, score_doc.doc).value})")
    end
  end

  def test_term_query
    tq = TermQuery.new(Term.new("field", "word2"))
    tq.boost = 100
    check_hits(tq, [1,4,8])

    # "word1" appears in every document; default search returns at most 10.
    tq = TermQuery.new(Term.new("field", "word1"))
    top_docs = @is.search(tq)
    assert_equal(@documents.size, top_docs.total_hits)
    assert_equal(10, top_docs.score_docs.size)
    top_docs = @is.search(tq, {:num_docs => 20})
    assert_equal(@documents.size, top_docs.score_docs.size)
  end

  def test_boolean_query
    bq = BooleanQuery.new()
    tq1 = TermQuery.new(Term.new("field", "word1"))
    tq2 = TermQuery.new(Term.new("field", "word3"))
    bq.add_query(tq1, BooleanClause::Occur::MUST)
    bq.add_query(tq2, BooleanClause::Occur::MUST)
    check_hits(bq, [2,3,6,8,11,14], 14)

    # Adding a SHOULD clause changes ranking but not the hit set.
    tq3 = TermQuery.new(Term.new("field", "word2"))
    bq.add_query(tq3, BooleanClause::Occur::SHOULD)
    check_hits(bq, [2,3,6,8,11,14], 8)

    bq = BooleanQuery.new()
    bq.add_query(tq2, BooleanClause::Occur::MUST)
    bq.add_query(tq3, BooleanClause::Occur::MUST_NOT)
    check_hits(bq, [2,3,6,11,14])

    # A lone MUST_NOT clause can match nothing.
    bq = BooleanQuery.new()
    bq.add_query(tq2, BooleanClause::Occur::MUST_NOT)
    check_hits(bq, [])

    bq = BooleanQuery.new()
    bq.add_query(tq2, BooleanClause::Occur::SHOULD)
    bq.add_query(tq3, BooleanClause::Occur::SHOULD)
    check_hits(bq, [1,2,3,4,6,8,11,14])
  end

  def test_phrase_query()
    pq = PhraseQuery.new()
    t1 = Term.new("field", "quick")
    t2 = Term.new("field", "brown")
    t3 = Term.new("field", "fox")
    pq << t1 << t2 << t3
    check_hits(pq, [1])

    pq.slop = 4
    check_hits(pq, [1,16,17])

    # Phrase with an explicit position gap ("quick _ fox").
    pq = PhraseQuery.new()
    pq << t1
    pq.add(t3, 2)
    check_hits(pq, [1,11,14])

    pq.slop = 1
    check_hits(pq, [1,11,14,16])

    pq.slop = 4
    check_hits(pq, [1,11,14,16,17])
  end

  def test_range_query()
    # All four combinations of inclusive/exclusive bounds.
    rq = RangeQuery.new("date", "20051006", "20051010", true, true)
    check_hits(rq, [6,7,8,9,10])

    rq = RangeQuery.new("date", "20051006", "20051010", false, true)
    check_hits(rq, [7,8,9,10])

    rq = RangeQuery.new("date", "20051006", "20051010", true, false)
    check_hits(rq, [6,7,8,9])

    rq = RangeQuery.new("date", "20051006", "20051010", false, false)
    check_hits(rq, [7,8,9])

    # Open lower bound.
    rq = RangeQuery.new("date", nil, "20051003", false, true)
    check_hits(rq, [0,1,2,3])

    rq = RangeQuery.new("date", nil, "20051003", false, false)
    check_hits(rq, [0,1,2])

    rq = RangeQuery.new_less("date", "20051003", true)
    check_hits(rq, [0,1,2,3])

    rq = RangeQuery.new_less("date", "20051003", false)
    check_hits(rq, [0,1,2])

    # Open upper bound.
    rq = RangeQuery.new("date", "20051014", nil, true, false)
    check_hits(rq, [14,15,16,17])

    rq = RangeQuery.new("date", "20051014", nil, false, false)
    check_hits(rq, [15,16,17])

    rq = RangeQuery.new_more("date", "20051014", true)
    check_hits(rq, [14,15,16,17])

    rq = RangeQuery.new_more("date", "20051014", false)
    check_hits(rq, [15,16,17])
  end

  def test_prefix_query()
    t = Term.new("cat", "cat1")
    pq = PrefixQuery.new(t)
    check_hits(pq, [0, 1, 2, 3, 4, 13, 14, 15, 16, 17])

    t.text = "cat1/sub2"
    pq = PrefixQuery.new(t)
    check_hits(pq, [3, 4, 13, 15])
  end

  def test_wildcard_query()
    t = Term.new("cat", "cat1*")
    wq = WildcardQuery.new(t)
    check_hits(wq, [0, 1, 2, 3, 4, 13, 14, 15, 16, 17])

    t.text = "cat1*/su??ub2"
    wq = WildcardQuery.new(t)
    check_hits(wq, [4, 16])
  end

  # Renamed from a duplicate +test_prefix_query+ definition which silently
  # shadowed the real prefix-query test above; this method actually tests
  # MultiPhraseQuery.
  def test_multi_phrase_query()
    t11 = Term.new("field", "quick")
    t12 = Term.new("field", "fast")
    t21 = Term.new("field", "brown")
    t22 = Term.new("field", "red")
    t23 = Term.new("field", "hairy")
    t3 = Term.new("field", "fox")

    mpq = MultiPhraseQuery.new()
    mpq << [t11, t12]
    mpq << [t21, t22, t23]
    mpq << t3
    check_hits(mpq, [1, 8, 11, 14])

    mpq.slop = 4
    check_hits(mpq, [1, 8, 11, 14, 16, 17])
  end
end