ferret 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (202):
  1. data/MIT-LICENSE +20 -0
  2. data/README +109 -0
  3. data/Rakefile +275 -0
  4. data/TODO +9 -0
  5. data/TUTORIAL +197 -0
  6. data/ext/extconf.rb +3 -0
  7. data/ext/ferret.c +23 -0
  8. data/ext/ferret.h +85 -0
  9. data/ext/index_io.c +543 -0
  10. data/ext/priority_queue.c +227 -0
  11. data/ext/ram_directory.c +316 -0
  12. data/ext/segment_merge_queue.c +41 -0
  13. data/ext/string_helper.c +42 -0
  14. data/ext/tags +240 -0
  15. data/ext/term.c +261 -0
  16. data/ext/term_buffer.c +299 -0
  17. data/ext/util.c +12 -0
  18. data/lib/ferret.rb +41 -0
  19. data/lib/ferret/analysis.rb +11 -0
  20. data/lib/ferret/analysis/analyzers.rb +93 -0
  21. data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
  22. data/lib/ferret/analysis/token.rb +79 -0
  23. data/lib/ferret/analysis/token_filters.rb +86 -0
  24. data/lib/ferret/analysis/token_stream.rb +26 -0
  25. data/lib/ferret/analysis/tokenizers.rb +107 -0
  26. data/lib/ferret/analysis/word_list_loader.rb +27 -0
  27. data/lib/ferret/document.rb +2 -0
  28. data/lib/ferret/document/document.rb +152 -0
  29. data/lib/ferret/document/field.rb +304 -0
  30. data/lib/ferret/index.rb +26 -0
  31. data/lib/ferret/index/compound_file_io.rb +343 -0
  32. data/lib/ferret/index/document_writer.rb +288 -0
  33. data/lib/ferret/index/field_infos.rb +259 -0
  34. data/lib/ferret/index/fields_io.rb +175 -0
  35. data/lib/ferret/index/index.rb +228 -0
  36. data/lib/ferret/index/index_file_names.rb +33 -0
  37. data/lib/ferret/index/index_reader.rb +462 -0
  38. data/lib/ferret/index/index_writer.rb +488 -0
  39. data/lib/ferret/index/multi_reader.rb +363 -0
  40. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
  41. data/lib/ferret/index/segment_infos.rb +130 -0
  42. data/lib/ferret/index/segment_merge_info.rb +47 -0
  43. data/lib/ferret/index/segment_merge_queue.rb +16 -0
  44. data/lib/ferret/index/segment_merger.rb +337 -0
  45. data/lib/ferret/index/segment_reader.rb +380 -0
  46. data/lib/ferret/index/segment_term_enum.rb +178 -0
  47. data/lib/ferret/index/segment_term_vector.rb +58 -0
  48. data/lib/ferret/index/term.rb +49 -0
  49. data/lib/ferret/index/term_buffer.rb +88 -0
  50. data/lib/ferret/index/term_doc_enum.rb +283 -0
  51. data/lib/ferret/index/term_enum.rb +52 -0
  52. data/lib/ferret/index/term_info.rb +41 -0
  53. data/lib/ferret/index/term_infos_io.rb +312 -0
  54. data/lib/ferret/index/term_vector_offset_info.rb +20 -0
  55. data/lib/ferret/index/term_vectors_io.rb +552 -0
  56. data/lib/ferret/query_parser.rb +274 -0
  57. data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
  58. data/lib/ferret/search.rb +49 -0
  59. data/lib/ferret/search/boolean_clause.rb +100 -0
  60. data/lib/ferret/search/boolean_query.rb +303 -0
  61. data/lib/ferret/search/boolean_scorer.rb +294 -0
  62. data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
  63. data/lib/ferret/search/conjunction_scorer.rb +99 -0
  64. data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
  65. data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
  66. data/lib/ferret/search/explanation.rb +41 -0
  67. data/lib/ferret/search/field_cache.rb +216 -0
  68. data/lib/ferret/search/field_doc.rb +31 -0
  69. data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
  70. data/lib/ferret/search/filter.rb +11 -0
  71. data/lib/ferret/search/filtered_query.rb +130 -0
  72. data/lib/ferret/search/filtered_term_enum.rb +79 -0
  73. data/lib/ferret/search/fuzzy_query.rb +153 -0
  74. data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
  75. data/lib/ferret/search/hit_collector.rb +34 -0
  76. data/lib/ferret/search/hit_queue.rb +11 -0
  77. data/lib/ferret/search/index_searcher.rb +173 -0
  78. data/lib/ferret/search/match_all_docs_query.rb +104 -0
  79. data/lib/ferret/search/multi_phrase_query.rb +204 -0
  80. data/lib/ferret/search/multi_term_query.rb +65 -0
  81. data/lib/ferret/search/non_matching_scorer.rb +22 -0
  82. data/lib/ferret/search/phrase_positions.rb +55 -0
  83. data/lib/ferret/search/phrase_query.rb +217 -0
  84. data/lib/ferret/search/phrase_scorer.rb +153 -0
  85. data/lib/ferret/search/prefix_query.rb +47 -0
  86. data/lib/ferret/search/query.rb +111 -0
  87. data/lib/ferret/search/query_filter.rb +51 -0
  88. data/lib/ferret/search/range_filter.rb +103 -0
  89. data/lib/ferret/search/range_query.rb +139 -0
  90. data/lib/ferret/search/req_excl_scorer.rb +125 -0
  91. data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
  92. data/lib/ferret/search/score_doc.rb +38 -0
  93. data/lib/ferret/search/score_doc_comparator.rb +114 -0
  94. data/lib/ferret/search/scorer.rb +91 -0
  95. data/lib/ferret/search/similarity.rb +278 -0
  96. data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
  97. data/lib/ferret/search/sort.rb +105 -0
  98. data/lib/ferret/search/sort_comparator.rb +60 -0
  99. data/lib/ferret/search/sort_field.rb +87 -0
  100. data/lib/ferret/search/spans.rb +12 -0
  101. data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
  102. data/lib/ferret/search/spans/span_first_query.rb +79 -0
  103. data/lib/ferret/search/spans/span_near_query.rb +108 -0
  104. data/lib/ferret/search/spans/span_not_query.rb +130 -0
  105. data/lib/ferret/search/spans/span_or_query.rb +176 -0
  106. data/lib/ferret/search/spans/span_query.rb +25 -0
  107. data/lib/ferret/search/spans/span_scorer.rb +74 -0
  108. data/lib/ferret/search/spans/span_term_query.rb +105 -0
  109. data/lib/ferret/search/spans/span_weight.rb +84 -0
  110. data/lib/ferret/search/spans/spans_enum.rb +44 -0
  111. data/lib/ferret/search/term_query.rb +128 -0
  112. data/lib/ferret/search/term_scorer.rb +181 -0
  113. data/lib/ferret/search/top_docs.rb +24 -0
  114. data/lib/ferret/search/top_field_docs.rb +17 -0
  115. data/lib/ferret/search/weight.rb +54 -0
  116. data/lib/ferret/search/wildcard_query.rb +26 -0
  117. data/lib/ferret/search/wildcard_term_enum.rb +61 -0
  118. data/lib/ferret/stemmers.rb +1 -0
  119. data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
  120. data/lib/ferret/store.rb +5 -0
  121. data/lib/ferret/store/buffered_index_io.rb +191 -0
  122. data/lib/ferret/store/directory.rb +139 -0
  123. data/lib/ferret/store/fs_store.rb +338 -0
  124. data/lib/ferret/store/index_io.rb +259 -0
  125. data/lib/ferret/store/ram_store.rb +282 -0
  126. data/lib/ferret/utils.rb +7 -0
  127. data/lib/ferret/utils/bit_vector.rb +105 -0
  128. data/lib/ferret/utils/date_tools.rb +138 -0
  129. data/lib/ferret/utils/number_tools.rb +91 -0
  130. data/lib/ferret/utils/parameter.rb +41 -0
  131. data/lib/ferret/utils/priority_queue.rb +120 -0
  132. data/lib/ferret/utils/string_helper.rb +47 -0
  133. data/lib/ferret/utils/weak_key_hash.rb +51 -0
  134. data/rake_utils/code_statistics.rb +106 -0
  135. data/setup.rb +1551 -0
  136. data/test/benchmark/tb_ram_store.rb +76 -0
  137. data/test/benchmark/tb_rw_vint.rb +26 -0
  138. data/test/longrunning/tc_numbertools.rb +60 -0
  139. data/test/longrunning/tm_store.rb +19 -0
  140. data/test/test_all.rb +9 -0
  141. data/test/test_helper.rb +6 -0
  142. data/test/unit/analysis/tc_analyzer.rb +21 -0
  143. data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
  144. data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
  145. data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
  146. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
  147. data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
  148. data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
  149. data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
  150. data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
  151. data/test/unit/analysis/tc_stop_filter.rb +14 -0
  152. data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
  153. data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
  154. data/test/unit/analysis/tc_word_list_loader.rb +32 -0
  155. data/test/unit/document/tc_document.rb +47 -0
  156. data/test/unit/document/tc_field.rb +80 -0
  157. data/test/unit/index/tc_compound_file_io.rb +107 -0
  158. data/test/unit/index/tc_field_infos.rb +119 -0
  159. data/test/unit/index/tc_fields_io.rb +167 -0
  160. data/test/unit/index/tc_index.rb +140 -0
  161. data/test/unit/index/tc_index_reader.rb +622 -0
  162. data/test/unit/index/tc_index_writer.rb +57 -0
  163. data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
  164. data/test/unit/index/tc_segment_infos.rb +74 -0
  165. data/test/unit/index/tc_segment_term_docs.rb +17 -0
  166. data/test/unit/index/tc_segment_term_enum.rb +60 -0
  167. data/test/unit/index/tc_segment_term_vector.rb +71 -0
  168. data/test/unit/index/tc_term.rb +22 -0
  169. data/test/unit/index/tc_term_buffer.rb +57 -0
  170. data/test/unit/index/tc_term_info.rb +19 -0
  171. data/test/unit/index/tc_term_infos_io.rb +192 -0
  172. data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
  173. data/test/unit/index/tc_term_vectors_io.rb +108 -0
  174. data/test/unit/index/th_doc.rb +244 -0
  175. data/test/unit/query_parser/tc_query_parser.rb +84 -0
  176. data/test/unit/search/tc_filter.rb +113 -0
  177. data/test/unit/search/tc_fuzzy_query.rb +136 -0
  178. data/test/unit/search/tc_index_searcher.rb +188 -0
  179. data/test/unit/search/tc_search_and_sort.rb +98 -0
  180. data/test/unit/search/tc_similarity.rb +37 -0
  181. data/test/unit/search/tc_sort.rb +48 -0
  182. data/test/unit/search/tc_sort_field.rb +27 -0
  183. data/test/unit/search/tc_spans.rb +153 -0
  184. data/test/unit/store/tc_fs_store.rb +84 -0
  185. data/test/unit/store/tc_ram_store.rb +35 -0
  186. data/test/unit/store/tm_store.rb +180 -0
  187. data/test/unit/store/tm_store_lock.rb +68 -0
  188. data/test/unit/ts_analysis.rb +16 -0
  189. data/test/unit/ts_document.rb +4 -0
  190. data/test/unit/ts_index.rb +18 -0
  191. data/test/unit/ts_query_parser.rb +3 -0
  192. data/test/unit/ts_search.rb +10 -0
  193. data/test/unit/ts_store.rb +6 -0
  194. data/test/unit/ts_utils.rb +10 -0
  195. data/test/unit/utils/tc_bit_vector.rb +65 -0
  196. data/test/unit/utils/tc_date_tools.rb +50 -0
  197. data/test/unit/utils/tc_number_tools.rb +59 -0
  198. data/test/unit/utils/tc_parameter.rb +40 -0
  199. data/test/unit/utils/tc_priority_queue.rb +62 -0
  200. data/test/unit/utils/tc_string_helper.rb +21 -0
  201. data/test/unit/utils/tc_weak_key_hash.rb +25 -0
  202. metadata +251 -0
@@ -0,0 +1,140 @@
1
+ require File.dirname(__FILE__) + "/../../test_helper"
2
+
3
+
4
+ class IndexTest < Test::Unit::TestCase
5
+ include Ferret::Index
6
+ include Ferret::Analysis
7
+
8
+ def setup()
9
+ @qp = Ferret::QueryParser.new()
10
+ end
11
+
12
+ def tear_down()
13
+ end
14
+
15
+ def check_results(index, query, expected)
16
+ cnt = 0
17
+ index.search_each(query) do |doc, score|
18
+ assert(expected.index(doc))
19
+ cnt += 1
20
+ end
21
+ assert_equal(expected.length, cnt)
22
+ end
23
+
24
+ def do_test_index_with_array(index)
25
+ data = [
26
+ ["one two"],
27
+ ["one", "three"],
28
+ ["two"],
29
+ ["one", "four"],
30
+ ["one two"],
31
+ ["two", "three", "four"],
32
+ ["one"],
33
+ ["two", "three", "four", "five"]
34
+ ]
35
+ data.each {|doc| index << doc }
36
+ assert_equal(8, index.size)
37
+ q = "one"
38
+ check_results(index, q, [0, 1, 3, 4, 6])
39
+ q = "one AND two"
40
+ check_results(index, q, [0, 4])
41
+ q = "one OR five"
42
+ check_results(index, q, [0, 1, 3, 4, 6, 7])
43
+ assert_equal("two three four five", index.doc(7)["def_field"])
44
+ end
45
+
46
+ def do_test_index_with_hash(index)
47
+ data = [
48
+ {"def_field" => "one two"},
49
+ {"def_field" => "one", "field2" => "three"},
50
+ {"def_field" => "two"},
51
+ {"def_field" => "one", "field2" => "four"},
52
+ {"def_field" => "one two"},
53
+ {"def_field" => "two", "field2" => "three", "field3" => "four"},
54
+ {"def_field" => "one"},
55
+ {"def_field" => "two", "field2" => "three", "field3" => "five"}
56
+ ]
57
+ data.each {|doc| index << doc }
58
+ q = "one AND two"
59
+ check_results(index, q, [0, 4])
60
+ q = "one OR five"
61
+ check_results(index, q, [0, 1, 3, 4, 6])
62
+ q = "one OR field3:five"
63
+ check_results(index, q, [0, 1, 3, 4, 6, 7])
64
+ assert_equal("four", index[5]["field3"])
65
+ q = "field3:f*"
66
+ check_results(index, q, [5, 7])
67
+ q = "two AND field3:f*"
68
+ check_results(index, q, [5, 7])
69
+ assert_equal("five", index.doc(7)["field3"])
70
+ assert_equal("two", index.doc(7)["def_field"])
71
+ end
72
+
73
+ def do_test_index_with_doc_array(index)
74
+ data = [
75
+ {"def_field" => "one two", :id => "me"},
76
+ {"def_field" => "one", :field2 => "three"},
77
+ {"def_field" => "two"},
78
+ {"def_field" => "one", :field2 => "four"},
79
+ {"def_field" => "one two"},
80
+ {"def_field" => "two", :field2 => "three", "field3" => "four"},
81
+ {"def_field" => "one"},
82
+ {"def_field" => "two", :field2 => "three", "field3" => "five"}
83
+ ]
84
+ data.each {|doc| index << doc }
85
+ q = "one AND two"
86
+ check_results(index, q, [0, 4])
87
+ q = "one OR five"
88
+ check_results(index, q, [0, 1, 3, 4, 6])
89
+ q = "one OR field3:five"
90
+ check_results(index, q, [0, 1, 3, 4, 6, 7])
91
+ q = "two AND (field3:f*)"
92
+ check_results(index, q, [5, 7])
93
+ doc = index[5]
94
+ assert_equal("three", index[5]["field2"])
95
+ assert(!index.has_deletions?)
96
+ assert(!index.deleted?(5))
97
+ assert_equal(8, index.size)
98
+ index.delete(5)
99
+ assert(index.has_deletions?)
100
+ assert(index.deleted?(5))
101
+ assert_equal(7, index.size)
102
+ check_results(index, q, [7])
103
+ doc["field2"] = "dave"
104
+ index << doc
105
+ check_results(index, q, [6, 7])
106
+ assert_equal(8, index.size)
107
+ assert_equal("dave", index[7]["field2"])
108
+ index.optimize
109
+ check_results(index, q, [6, 7])
110
+ t = Term.new("field2", "three")
111
+ index.delete(t)
112
+ assert(index.deleted?(1))
113
+ assert(index.deleted?(6))
114
+ assert(! index.deleted?(7))
115
+ t = Term.new("field2", "four")
116
+ assert_equal("one", index[t]["def_field"])
117
+ assert_equal("one two", index["me"]["def_field"])
118
+ index.delete("me")
119
+ assert(index.deleted?(0))
120
+ end
121
+
122
+ def test_ram_index
123
+ index = Index.new(:default_field => "def_field")
124
+ do_test_index_with_array(index)
125
+ index = Index.new(:default_field => "def_field")
126
+ do_test_index_with_hash(index)
127
+ index = Index.new(:default_field => "def_field")
128
+ do_test_index_with_doc_array(index)
129
+ end
130
+
131
+ def test_fs_index
132
+ fs_path = File.join(File.dirname(__FILE__), '../../temp/fsdir')
133
+ index = Index.new(:path => fs_path, :create => true, :default_field => "def_field")
134
+ do_test_index_with_array(index)
135
+ index = Index.new(:path => fs_path, :create => true, :default_field => "def_field")
136
+ do_test_index_with_hash(index)
137
+ index = Index.new(:create => true, :default_field => "def_field")
138
+ do_test_index_with_doc_array(index)
139
+ end
140
+ end
@@ -0,0 +1,622 @@
1
+ require File.dirname(__FILE__) + "/../../test_helper"
2
+
3
+ module IndexReaderCommon
4
+
5
+ include Ferret::Index
6
+ include Ferret::Analysis
7
+
8
+ def test_index_reader
9
+
10
+ do_test_term_doc_enum()
11
+
12
+ do_test_term_vectors()
13
+
14
+ do_test_changing_field()
15
+
16
+ do_test_get_doc()
17
+
18
+ end
19
+
20
+ def do_test_term_doc_enum()
21
+
22
+ assert_equal(IndexTestHelper::IR_TEST_DOC_CNT, @ir.num_docs())
23
+ assert_equal(IndexTestHelper::IR_TEST_DOC_CNT, @ir.max_doc())
24
+
25
+ term = Term.new("body", "Wally")
26
+ assert_equal(4, @ir.doc_freq(term))
27
+
28
+ tde = @ir.term_docs_for(term)
29
+
30
+ assert(tde.next?)
31
+ assert_equal(0, tde.doc())
32
+ assert_equal(1, tde.freq())
33
+ assert(tde.next?)
34
+ assert_equal(5, tde.doc())
35
+ assert_equal(1, tde.freq())
36
+ assert(tde.next?)
37
+ assert_equal(18, tde.doc())
38
+ assert_equal(3, tde.freq())
39
+ assert(tde.next?)
40
+ assert_equal(20, tde.doc())
41
+ assert_equal(6, tde.freq())
42
+ assert_equal(false, tde.next?)
43
+
44
+ # test fast read. Use a small array to exercise repeat read
45
+ docs = Array.new(3)
46
+ freqs = Array.new(3)
47
+
48
+ term = Term.new("body", "read")
49
+ tde.seek(term)
50
+ assert_equal(3, tde.read(docs, freqs))
51
+ assert_equal([1,2,6], docs)
52
+ assert_equal([1,2,4], freqs)
53
+
54
+ assert_equal(3, tde.read(docs, freqs))
55
+ assert_equal([9, 10, 15], docs)
56
+ assert_equal([3, 1, 1], freqs)
57
+
58
+ assert_equal(3, tde.read(docs, freqs))
59
+ assert_equal([16, 17, 20], docs)
60
+ assert_equal([2, 1, 1], freqs)
61
+
62
+ assert_equal(1, tde.read(docs, freqs))
63
+ assert_equal([21], docs[0, 1])
64
+ assert_equal([6], freqs[0, 1])
65
+
66
+ assert_equal(0, tde.read(docs, freqs))
67
+
68
+ do_test_term_docpos_enum_skip_to(tde)
69
+ tde.close()
70
+
71
+ # test term positions
72
+ term = Term.new("body", "read")
73
+ tde = @ir.term_positions_for(term)
74
+ assert(tde.next?)
75
+ assert_equal(1, tde.doc())
76
+ assert_equal(1, tde.freq())
77
+ assert_equal(3, tde.next_position())
78
+
79
+ assert(tde.next?)
80
+ assert_equal(2, tde.doc())
81
+ assert_equal(2, tde.freq())
82
+ assert_equal(1, tde.next_position())
83
+ assert_equal(4, tde.next_position())
84
+
85
+ assert(tde.next?)
86
+ assert_equal(6, tde.doc())
87
+ assert_equal(4, tde.freq())
88
+ assert_equal(3, tde.next_position())
89
+ assert_equal(4, tde.next_position())
90
+
91
+ assert(tde.next?)
92
+ assert_equal(9, tde.doc())
93
+ assert_equal(3, tde.freq())
94
+ assert_equal(0, tde.next_position())
95
+ assert_equal(4, tde.next_position())
96
+
97
+ assert(tde.skip_to(16))
98
+ assert_equal(16, tde.doc())
99
+ assert_equal(2, tde.freq())
100
+ assert_equal(2, tde.next_position())
101
+
102
+ assert(tde.skip_to(21))
103
+ assert_equal(21, tde.doc())
104
+ assert_equal(6, tde.freq())
105
+ assert_equal(3, tde.next_position())
106
+ assert_equal(4, tde.next_position())
107
+ assert_equal(5, tde.next_position())
108
+ assert_equal(8, tde.next_position())
109
+ assert_equal(9, tde.next_position())
110
+ assert_equal(10, tde.next_position())
111
+
112
+ assert_equal(false, tde.next?)
113
+
114
+ do_test_term_docpos_enum_skip_to(tde)
115
+ tde.close()
116
+ end
117
+
118
+ def do_test_term_docpos_enum_skip_to(tde)
119
+ term = Term.new("text", "skip")
120
+ tde.seek(term)
121
+
122
+ assert(tde.skip_to(10))
123
+ assert_equal(22, tde.doc())
124
+ assert_equal(22, tde.freq())
125
+
126
+ assert(tde.skip_to(60))
127
+ assert_equal(60, tde.doc())
128
+ assert_equal(60, tde.freq())
129
+
130
+ tde.seek(term)
131
+ assert(tde.skip_to(45))
132
+ assert_equal(45, tde.doc())
133
+ assert_equal(45, tde.freq())
134
+
135
+ assert(tde.skip_to(62))
136
+ assert_equal(62, tde.doc())
137
+ assert_equal(62, tde.freq())
138
+
139
+ assert(tde.skip_to(63))
140
+ assert_equal(63, tde.doc())
141
+ assert_equal(63, tde.freq())
142
+
143
+ assert_equal(false, tde.skip_to(64))
144
+
145
+ tde.seek(term)
146
+ assert_equal(false, tde.skip_to(64))
147
+ end
148
+
149
+ def t(start_offset, end_offset)
150
+ TermVectorOffsetInfo.new(start_offset, end_offset)
151
+ end
152
+
153
+ def do_test_term_vectors()
154
+ tv = @ir.get_term_vector(3, "body")
155
+
156
+ assert_equal("body", tv.field)
157
+ assert_equal(["word1", "word2", "word3", "word4"], tv.terms)
158
+ assert_equal([3, 1, 4, 2], tv.term_frequencies)
159
+ assert_equal([[2, 4, 7], [3], [0, 5, 8, 9], [1,6]], tv.positions)
160
+ assert_equal([[t(12,17), t(24,29), t(42,47)],
161
+ [t(18,23)],
162
+ [t(0,5), t(30,35), t(48,53), t(54,59)],
163
+ [t(6,11), t(36,41)]], tv.offsets)
164
+ tv = nil
165
+
166
+ tvs = @ir.get_term_vectors(3)
167
+ assert_equal(3, tvs.size)
168
+ tv = tvs[0]
169
+ assert_equal("author", tv.field)
170
+ assert_equal(["Leo", "Tolstoy"], tv.terms)
171
+ assert(tv.offsets.nil?)
172
+ tv = tvs[1]
173
+ assert_equal("body", tv.field)
174
+ assert_equal(["word1", "word2", "word3", "word4"], tv.terms)
175
+ tv = tvs[2]
176
+ assert_equal("title", tv.field)
177
+ assert_equal(["War And Peace"], tv.terms)
178
+ assert(tv.positions.nil?)
179
+ assert_equal(t(0, 13), tv.offsets[0][0])
180
+ end
181
+
182
+ def do_test_changing_field()
183
+ tv = @ir.get_term_vector(0, "changing_field")
184
+ assert(tv.nil?)
185
+
186
+ tv = @ir.get_term_vector(10, "changing_field")
187
+ assert(tv.positions.nil?)
188
+ assert(tv.offsets.nil?)
189
+
190
+ tv = @ir.get_term_vector(17, "changing_field")
191
+ assert(tv.positions)
192
+ assert(tv.offsets.nil?)
193
+
194
+ tv = @ir.get_term_vector(19, "changing_field")
195
+ assert(tv.positions.nil?)
196
+ assert(tv.offsets)
197
+
198
+ tv = @ir.get_term_vector(20, "changing_field")
199
+ assert(tv.positions)
200
+ assert(tv.offsets)
201
+
202
+ tv = @ir.get_term_vector(21, "changing_field")
203
+ assert(tv.nil?)
204
+ end
205
+
206
+ def do_test_get_doc()
207
+ doc = @ir.get_document(3)
208
+ assert_equal(4, doc.field_count)
209
+
210
+ df = doc.field("author")
211
+ assert_equal("author", df.name)
212
+ assert_equal("Leo Tolstoy", df.data)
213
+ assert_equal(df.boost, 1.0)
214
+ assert_equal(true, df.stored?)
215
+ assert_equal(false, df.compressed?)
216
+ assert_equal(true, df.indexed?)
217
+ assert_equal(true, df.tokenized?)
218
+ assert_equal(true, df.store_term_vector?)
219
+ assert_equal(true, df.store_positions?)
220
+ assert_equal(false, df.store_offsets?)
221
+ assert_equal(false, df.binary?)
222
+
223
+ df = doc.field("body")
224
+ assert_equal("body", df.name)
225
+ assert_equal("word3 word4 word1 word2 word1 word3 word4 word1 word3 word3", df.data)
226
+ assert_equal(df.boost, 1.0)
227
+ assert_equal(true, df.stored?)
228
+ assert_equal(false, df.compressed?)
229
+ assert_equal(true, df.indexed?)
230
+ assert_equal(true, df.tokenized?)
231
+ assert_equal(true, df.store_term_vector?)
232
+ assert_equal(true, df.store_positions?)
233
+ assert_equal(true, df.store_offsets?)
234
+ assert_equal(false, df.binary?)
235
+
236
+ df = doc.field("title")
237
+ assert_equal("title", df.name)
238
+ assert_equal("War And Peace", df.data)
239
+ assert_equal(df.boost, 1.0)
240
+ assert_equal(true, df.stored?)
241
+ assert_equal(false, df.compressed?)
242
+ assert_equal(true, df.indexed?)
243
+ assert_equal(false, df.tokenized?)
244
+ assert_equal(true, df.store_term_vector?)
245
+ assert_equal(false, df.store_positions?)
246
+ assert_equal(true, df.store_offsets?)
247
+ assert_equal(false, df.binary?)
248
+
249
+ df = doc.field("year")
250
+ assert_equal("year", df.name)
251
+ assert_equal("1865", df.data)
252
+ assert_equal(df.boost, 1.0)
253
+ assert_equal(true, df.stored?)
254
+ assert_equal(false, df.compressed?)
255
+ assert_equal(false, df.indexed?)
256
+ assert_equal(false, df.tokenized?)
257
+ assert_equal(false, df.store_term_vector?)
258
+ assert_equal(false, df.store_positions?)
259
+ assert_equal(false, df.store_offsets?)
260
+ assert_equal(false, df.binary?)
261
+
262
+
263
+ df = doc.field("text")
264
+ assert(df.nil?) # "text" is not stored
265
+ end
266
+
267
+ def test_ir_norms()
268
+ @ir.set_norm(3, "title", 1)
269
+ @ir.set_norm(3, "body", 12)
270
+ @ir.set_norm(3, "author", 145)
271
+ @ir.set_norm(3, "year", 31)
272
+ @ir.set_norm(3, "text", 202)
273
+ @ir.set_norm(25, "text", 20)
274
+ @ir.set_norm(50, "text", 200)
275
+ @ir.set_norm(63, "text", 155)
276
+
277
+ norms = @ir.get_norms("text")
278
+
279
+ assert_equal(202, norms[3])
280
+ assert_equal(20, norms[25])
281
+ assert_equal(200, norms[50])
282
+ assert_equal(155, norms[63])
283
+
284
+ norms = @ir.get_norms("title")
285
+ assert_equal(1, norms[3])
286
+
287
+ norms = @ir.get_norms("body")
288
+ assert_equal(12, norms[3])
289
+
290
+ norms = @ir.get_norms("author")
291
+ assert_equal(145, norms[3])
292
+
293
+ norms = @ir.get_norms("year")
294
+ # TODO: this returns two possible results depending on whether it is
295
+ # a multi reader or a segment reader. If it is a multi reader it will
296
+ # always return an empty set of norms, otherwise it will return nil.
297
+ # I'm not sure what to do here just yet or if this is even an issue.
298
+ #assert(norms.nil?)
299
+
300
+ norms = " " * 164
301
+ @ir.get_norms_into("text", norms, 100)
302
+ assert_equal(202, norms[103])
303
+ assert_equal(20, norms[125])
304
+ assert_equal(200, norms[150])
305
+ assert_equal(155, norms[163])
306
+
307
+ @ir.commit()
308
+
309
+ iw = IndexWriter.new(@dir, :analyzer => WhiteSpaceAnalyzer.new())
310
+ iw.optimize()
311
+ iw.close()
312
+
313
+ ir2 = IndexReader.open(@dir, false)
314
+
315
+ norms = " " * 164
316
+ ir2.get_norms_into("text", norms, 100)
317
+ assert_equal(202, norms[103])
318
+ assert_equal(20, norms[125])
319
+ assert_equal(200, norms[150])
320
+ assert_equal(155, norms[163])
321
+ ir2.close()
322
+ end
323
+
324
+ def test_ir_delete()
325
+ doc_count = IndexTestHelper::IR_TEST_DOC_CNT
326
+ assert_equal(false, @ir.has_deletions?())
327
+ assert_equal(doc_count, @ir.max_doc())
328
+ assert_equal(doc_count, @ir.num_docs())
329
+ assert_equal(false, @ir.deleted?(10))
330
+
331
+ @ir.delete(10)
332
+ assert_equal(true, @ir.has_deletions?())
333
+ assert_equal(doc_count, @ir.max_doc())
334
+ assert_equal(doc_count - 1, @ir.num_docs())
335
+ assert_equal(true, @ir.deleted?(10))
336
+
337
+ @ir.delete(10)
338
+ assert_equal(true, @ir.has_deletions?())
339
+ assert_equal(doc_count, @ir.max_doc())
340
+ assert_equal(doc_count - 1, @ir.num_docs())
341
+ assert_equal(true, @ir.deleted?(10))
342
+
343
+ @ir.delete(doc_count - 1)
344
+ assert_equal(true, @ir.has_deletions?())
345
+ assert_equal(doc_count, @ir.max_doc())
346
+ assert_equal(doc_count - 2, @ir.num_docs())
347
+ assert_equal(true, @ir.deleted?(doc_count - 1))
348
+
349
+ @ir.delete(doc_count - 2)
350
+ assert_equal(true, @ir.has_deletions?())
351
+ assert_equal(doc_count, @ir.max_doc())
352
+ assert_equal(doc_count - 3, @ir.num_docs())
353
+ assert_equal(true, @ir.deleted?(doc_count - 2))
354
+
355
+ @ir.undelete_all()
356
+ assert_equal(false, @ir.has_deletions?())
357
+ assert_equal(doc_count, @ir.max_doc())
358
+ assert_equal(doc_count, @ir.num_docs())
359
+ assert_equal(false, @ir.deleted?(10))
360
+ assert_equal(false, @ir.deleted?(doc_count - 2))
361
+ assert_equal(false, @ir.deleted?(doc_count - 1))
362
+
363
+ @ir.delete(10)
364
+ @ir.delete(20)
365
+ @ir.delete(30)
366
+ @ir.delete(40)
367
+ @ir.delete(50)
368
+ @ir.delete(doc_count - 1)
369
+ assert_equal(true, @ir.has_deletions?())
370
+ assert_equal(doc_count, @ir.max_doc())
371
+ assert_equal(doc_count - 6, @ir.num_docs())
372
+
373
+ @ir.commit()
374
+
375
+ ir2 = IndexReader.open(@dir, false)
376
+
377
+ assert_equal(true, ir2.has_deletions?())
378
+ assert_equal(doc_count, ir2.max_doc())
379
+ assert_equal(doc_count - 6, ir2.num_docs())
380
+ assert_equal(true, ir2.deleted?(10))
381
+ assert_equal(true, ir2.deleted?(20))
382
+ assert_equal(true, ir2.deleted?(30))
383
+ assert_equal(true, ir2.deleted?(40))
384
+ assert_equal(true, ir2.deleted?(50))
385
+ assert_equal(true, ir2.deleted?(doc_count - 1))
386
+
387
+ ir2.undelete_all()
388
+ assert_equal(false, ir2.has_deletions?())
389
+ assert_equal(doc_count, ir2.max_doc())
390
+ assert_equal(doc_count, ir2.num_docs())
391
+ assert_equal(false, ir2.deleted?(10))
392
+ assert_equal(false, ir2.deleted?(20))
393
+ assert_equal(false, ir2.deleted?(30))
394
+ assert_equal(false, ir2.deleted?(40))
395
+ assert_equal(false, ir2.deleted?(50))
396
+ assert_equal(false, ir2.deleted?(doc_count - 1))
397
+
398
+ ir2.delete(10)
399
+ ir2.delete(20)
400
+ ir2.delete(30)
401
+ ir2.delete(40)
402
+ ir2.delete(50)
403
+ ir2.delete(doc_count - 1)
404
+
405
+ ir2.commit()
406
+
407
+ iw = IndexWriter.new(@dir, :analyzer => WhiteSpaceAnalyzer.new())
408
+ iw.optimize()
409
+ iw.close()
410
+
411
+ ir3 = IndexReader.open(@dir, false)
412
+
413
+ assert(!ir3.has_deletions?())
414
+ assert_equal(doc_count - 6, ir3.max_doc())
415
+ assert_equal(doc_count - 6, ir3.num_docs())
416
+
417
+ ir3.close()
418
+ end
419
+
420
+
421
+ end
422
+
423
+ class SegmentReaderTest < Test::Unit::TestCase
424
+ include IndexReaderCommon
425
+
426
+ def setup()
427
+ @dir = Ferret::Store::RAMDirectory.new()
428
+ iw = IndexWriter.new(@dir, :analyzer => WhiteSpaceAnalyzer.new(), :create => true)
429
+ docs = IndexTestHelper.prepare_ir_test_docs()
430
+ IndexTestHelper::IR_TEST_DOC_CNT.times do |i|
431
+ iw << docs[i]
432
+ end
433
+
434
+ # we must optimize here so that SegmentReader is used.
435
+ iw.optimize()
436
+ iw.close()
437
+ @ir = IndexReader.open(@dir, false)
438
+ end
439
+
440
+ def tear_down()
441
+ @ir.close()
442
+ @dir.close()
443
+ end
444
+ end
445
+
446
+ class MultiReaderTest < Test::Unit::TestCase
447
+ include IndexReaderCommon
448
+
449
+ def setup()
450
+ @dir = Ferret::Store::RAMDirectory.new()
451
+ iw = IndexWriter.new(@dir, :analyzer => WhiteSpaceAnalyzer.new(), :create => true)
452
+ docs = IndexTestHelper.prepare_ir_test_docs()
453
+ IndexTestHelper::IR_TEST_DOC_CNT.times do |i|
454
+ iw << docs[i]
455
+ end
456
+
457
+ # we mustn't optimize here so that MultiReader is used.
458
+ # iw.optimize()
459
+ iw.close()
460
+ @ir = IndexReader.open(@dir, false)
461
+ end
462
+
463
+ def tear_down()
464
+ @ir.close()
465
+ @dir.close()
466
+ end
467
+ end
468
+
469
+ class IndexReaderTest < Test::Unit::TestCase
470
+ include Ferret::Index
471
+ include Ferret::Analysis
472
+ include Ferret::Document
473
+
474
+ def setup()
475
+ @dir = Ferret::Store::RAMDirectory.new()
476
+ end
477
+
478
+ def tear_down()
479
+ @dir.close()
480
+ end
481
+
482
+ def test_ir_multivalue_fields()
483
+ iw = IndexWriter.new(@dir, :analyzer => WhiteSpaceAnalyzer.new(), :create => true)
484
+ doc = Document.new()
485
+ doc << Field.new("tag", "Ruby", Field::Store::YES, Field::Index::NO, Field::TermVector::NO)
486
+ doc << Field.new("tag", "C", Field::Store::YES, Field::Index::UNTOKENIZED, Field::TermVector::NO)
487
+ doc << Field.new("body", "this is the body Document Field", Field::Store::YES, Field::Index::UNTOKENIZED, Field::TermVector::WITH_POSITIONS_OFFSETS)
488
+ doc << Field.new("tag", "Lucene", Field::Store::YES, Field::Index::TOKENIZED, Field::TermVector::WITH_POSITIONS)
489
+ doc << Field.new("tag", "Ferret", Field::Store::YES, Field::Index::UNTOKENIZED, Field::TermVector::WITH_OFFSETS)
490
+ doc << Field.new("title", "this is the title DocField", Field::Store::YES, Field::Index::UNTOKENIZED, Field::TermVector::WITH_POSITIONS_OFFSETS)
491
+ doc << Field.new("author", "this is the author field", Field::Store::YES, Field::Index::UNTOKENIZED, Field::TermVector::WITH_POSITIONS_OFFSETS)
492
+
493
+ fis = FieldInfos.new()
494
+ fis << doc
495
+ assert_equal(4, fis.size)
496
+
497
+ fi = fis["tag"]
498
+ assert_equal(true, fi.indexed?)
499
+ assert_equal(true, fi.store_term_vector?)
500
+ assert_equal(true, fi.store_positions?)
501
+ assert_equal(true, fi.store_offsets?)
502
+
503
+ iw << doc
504
+ iw.close()
505
+
506
+ ir = IndexReader.open(@dir, false)
507
+
508
+ doc = ir.get_document(0)
509
+ assert_equal(4, doc.field_count)
510
+ assert_equal(7, doc.entry_count)
511
+ entries = doc.fields("tag")
512
+ assert_equal(4, entries.size)
513
+ assert_equal("Ruby", entries[0].data)
514
+ assert_equal("C", entries[1].data)
515
+ assert_equal("Lucene", entries[2].data)
516
+ assert_equal("Ferret", entries[3].data)
517
+
518
+ doc.remove_field("tag")
519
+ assert_equal(4, doc.field_count)
520
+ assert_equal(6, doc.entry_count)
521
+ assert_equal("C", doc.field("tag").data)
522
+
523
+ doc.remove_fields("tag")
524
+ assert_equal(3, doc.field_count)
525
+ assert_equal(3, doc.entry_count)
526
+
527
+ ir.delete(0)
528
+ ir.close()
529
+
530
+ iw = IndexWriter.new(@dir, :analyzer => WhiteSpaceAnalyzer.new())
531
+ iw << doc
532
+ iw.optimize()
533
+ iw.close()
534
+ doc = nil
535
+
536
+ ir = IndexReader.open(@dir, false)
537
+ doc = ir.get_document(0)
538
+ assert_equal(3, doc.field_count)
539
+ assert_equal(3, doc.entry_count)
540
+
541
+ ir.close()
542
+ end
543
+
544
+ def t(start_offset, end_offset)
545
+ TermVectorOffsetInfo.new(start_offset, end_offset)
546
+ end
547
+
548
+ def do_test_term_vectors(ir)
549
+ tv = ir.get_term_vector(3, "body")
550
+
551
+ assert_equal("body", tv.field)
552
+ assert_equal(["word1", "word2", "word3", "word4"], tv.terms)
553
+ assert_equal([3, 1, 4, 2], tv.term_frequencies)
554
+ assert_equal([[2, 4, 7], [3], [0, 5, 8, 9], [1,6]], tv.positions)
555
+ assert_equal([[t(12,17), t(24,29), t(42,47)],
556
+ [t(18,23)],
557
+ [t(0,5), t(30,35), t(48,53), t(54,59)],
558
+ [t(6,11), t(36,41)]], tv.offsets)
559
+ tv = nil
560
+
561
+ tvs = ir.get_term_vectors(3)
562
+ assert_equal(3, tvs.size)
563
+ tv = tvs[0]
564
+ assert_equal("author", tv.field)
565
+ assert_equal(["Leo", "Tolstoy"], tv.terms)
566
+ assert(tv.offsets.nil?)
567
+ tv = tvs[1]
568
+ assert_equal("body", tv.field)
569
+ assert_equal(["word1", "word2", "word3", "word4"], tv.terms)
570
+ tv = tvs[2]
571
+ assert_equal("title", tv.field)
572
+ assert_equal(["War And Peace"], tv.terms)
573
+ assert(tv.positions.nil?)
574
+ assert_equal(t(0, 13), tv.offsets[0][0])
575
+ end
576
+
577
+ def test_ir_read_while_optimizing()
578
+ iw = IndexWriter.new(@dir, :analyzer => WhiteSpaceAnalyzer.new(), :create => true)
579
+ docs = IndexTestHelper.prepare_ir_test_docs()
580
+ IndexTestHelper::IR_TEST_DOC_CNT.times do |i|
581
+ iw << docs[i]
582
+ end
583
+ iw.close()
584
+
585
+ ir = IndexReader.open(@dir, false)
586
+ do_test_term_vectors(ir)
587
+
588
+ iw = IndexWriter.new(@dir, :analyzer => WhiteSpaceAnalyzer.new())
589
+ iw.optimize()
590
+ iw.close()
591
+
592
+ do_test_term_vectors(ir)
593
+
594
+ ir.close()
595
+ end
596
+
597
+ def test_ir_read_while_optimizing_on_disk()
598
+ dpath = File.join(File.dirname(__FILE__),
599
+ '../../temp/fsdir')
600
+ fs_dir = Ferret::Store::FSDirectory.get_directory(dpath, true)
601
+
602
+ iw = IndexWriter.new(fs_dir, :analyzer => WhiteSpaceAnalyzer.new(), :create => true)
603
+ docs = IndexTestHelper.prepare_ir_test_docs()
604
+ IndexTestHelper::IR_TEST_DOC_CNT.times do |i|
605
+ iw << docs[i]
606
+ end
607
+ iw.close()
608
+
609
+ ir = IndexReader.open(fs_dir, false)
610
+ do_test_term_vectors(ir)
611
+
612
+ iw = IndexWriter.new(fs_dir, :analyzer => WhiteSpaceAnalyzer.new())
613
+ iw.optimize()
614
+ iw.close()
615
+
616
+ do_test_term_vectors(ir)
617
+
618
+ ir.close()
619
+ fs_dir.close()
620
+ end
621
+ end
622
+