ferret 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (202) hide show
  1. data/MIT-LICENSE +20 -0
  2. data/README +109 -0
  3. data/Rakefile +275 -0
  4. data/TODO +9 -0
  5. data/TUTORIAL +197 -0
  6. data/ext/extconf.rb +3 -0
  7. data/ext/ferret.c +23 -0
  8. data/ext/ferret.h +85 -0
  9. data/ext/index_io.c +543 -0
  10. data/ext/priority_queue.c +227 -0
  11. data/ext/ram_directory.c +316 -0
  12. data/ext/segment_merge_queue.c +41 -0
  13. data/ext/string_helper.c +42 -0
  14. data/ext/tags +240 -0
  15. data/ext/term.c +261 -0
  16. data/ext/term_buffer.c +299 -0
  17. data/ext/util.c +12 -0
  18. data/lib/ferret.rb +41 -0
  19. data/lib/ferret/analysis.rb +11 -0
  20. data/lib/ferret/analysis/analyzers.rb +93 -0
  21. data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
  22. data/lib/ferret/analysis/token.rb +79 -0
  23. data/lib/ferret/analysis/token_filters.rb +86 -0
  24. data/lib/ferret/analysis/token_stream.rb +26 -0
  25. data/lib/ferret/analysis/tokenizers.rb +107 -0
  26. data/lib/ferret/analysis/word_list_loader.rb +27 -0
  27. data/lib/ferret/document.rb +2 -0
  28. data/lib/ferret/document/document.rb +152 -0
  29. data/lib/ferret/document/field.rb +304 -0
  30. data/lib/ferret/index.rb +26 -0
  31. data/lib/ferret/index/compound_file_io.rb +343 -0
  32. data/lib/ferret/index/document_writer.rb +288 -0
  33. data/lib/ferret/index/field_infos.rb +259 -0
  34. data/lib/ferret/index/fields_io.rb +175 -0
  35. data/lib/ferret/index/index.rb +228 -0
  36. data/lib/ferret/index/index_file_names.rb +33 -0
  37. data/lib/ferret/index/index_reader.rb +462 -0
  38. data/lib/ferret/index/index_writer.rb +488 -0
  39. data/lib/ferret/index/multi_reader.rb +363 -0
  40. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
  41. data/lib/ferret/index/segment_infos.rb +130 -0
  42. data/lib/ferret/index/segment_merge_info.rb +47 -0
  43. data/lib/ferret/index/segment_merge_queue.rb +16 -0
  44. data/lib/ferret/index/segment_merger.rb +337 -0
  45. data/lib/ferret/index/segment_reader.rb +380 -0
  46. data/lib/ferret/index/segment_term_enum.rb +178 -0
  47. data/lib/ferret/index/segment_term_vector.rb +58 -0
  48. data/lib/ferret/index/term.rb +49 -0
  49. data/lib/ferret/index/term_buffer.rb +88 -0
  50. data/lib/ferret/index/term_doc_enum.rb +283 -0
  51. data/lib/ferret/index/term_enum.rb +52 -0
  52. data/lib/ferret/index/term_info.rb +41 -0
  53. data/lib/ferret/index/term_infos_io.rb +312 -0
  54. data/lib/ferret/index/term_vector_offset_info.rb +20 -0
  55. data/lib/ferret/index/term_vectors_io.rb +552 -0
  56. data/lib/ferret/query_parser.rb +274 -0
  57. data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
  58. data/lib/ferret/search.rb +49 -0
  59. data/lib/ferret/search/boolean_clause.rb +100 -0
  60. data/lib/ferret/search/boolean_query.rb +303 -0
  61. data/lib/ferret/search/boolean_scorer.rb +294 -0
  62. data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
  63. data/lib/ferret/search/conjunction_scorer.rb +99 -0
  64. data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
  65. data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
  66. data/lib/ferret/search/explanation.rb +41 -0
  67. data/lib/ferret/search/field_cache.rb +216 -0
  68. data/lib/ferret/search/field_doc.rb +31 -0
  69. data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
  70. data/lib/ferret/search/filter.rb +11 -0
  71. data/lib/ferret/search/filtered_query.rb +130 -0
  72. data/lib/ferret/search/filtered_term_enum.rb +79 -0
  73. data/lib/ferret/search/fuzzy_query.rb +153 -0
  74. data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
  75. data/lib/ferret/search/hit_collector.rb +34 -0
  76. data/lib/ferret/search/hit_queue.rb +11 -0
  77. data/lib/ferret/search/index_searcher.rb +173 -0
  78. data/lib/ferret/search/match_all_docs_query.rb +104 -0
  79. data/lib/ferret/search/multi_phrase_query.rb +204 -0
  80. data/lib/ferret/search/multi_term_query.rb +65 -0
  81. data/lib/ferret/search/non_matching_scorer.rb +22 -0
  82. data/lib/ferret/search/phrase_positions.rb +55 -0
  83. data/lib/ferret/search/phrase_query.rb +217 -0
  84. data/lib/ferret/search/phrase_scorer.rb +153 -0
  85. data/lib/ferret/search/prefix_query.rb +47 -0
  86. data/lib/ferret/search/query.rb +111 -0
  87. data/lib/ferret/search/query_filter.rb +51 -0
  88. data/lib/ferret/search/range_filter.rb +103 -0
  89. data/lib/ferret/search/range_query.rb +139 -0
  90. data/lib/ferret/search/req_excl_scorer.rb +125 -0
  91. data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
  92. data/lib/ferret/search/score_doc.rb +38 -0
  93. data/lib/ferret/search/score_doc_comparator.rb +114 -0
  94. data/lib/ferret/search/scorer.rb +91 -0
  95. data/lib/ferret/search/similarity.rb +278 -0
  96. data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
  97. data/lib/ferret/search/sort.rb +105 -0
  98. data/lib/ferret/search/sort_comparator.rb +60 -0
  99. data/lib/ferret/search/sort_field.rb +87 -0
  100. data/lib/ferret/search/spans.rb +12 -0
  101. data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
  102. data/lib/ferret/search/spans/span_first_query.rb +79 -0
  103. data/lib/ferret/search/spans/span_near_query.rb +108 -0
  104. data/lib/ferret/search/spans/span_not_query.rb +130 -0
  105. data/lib/ferret/search/spans/span_or_query.rb +176 -0
  106. data/lib/ferret/search/spans/span_query.rb +25 -0
  107. data/lib/ferret/search/spans/span_scorer.rb +74 -0
  108. data/lib/ferret/search/spans/span_term_query.rb +105 -0
  109. data/lib/ferret/search/spans/span_weight.rb +84 -0
  110. data/lib/ferret/search/spans/spans_enum.rb +44 -0
  111. data/lib/ferret/search/term_query.rb +128 -0
  112. data/lib/ferret/search/term_scorer.rb +181 -0
  113. data/lib/ferret/search/top_docs.rb +24 -0
  114. data/lib/ferret/search/top_field_docs.rb +17 -0
  115. data/lib/ferret/search/weight.rb +54 -0
  116. data/lib/ferret/search/wildcard_query.rb +26 -0
  117. data/lib/ferret/search/wildcard_term_enum.rb +61 -0
  118. data/lib/ferret/stemmers.rb +1 -0
  119. data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
  120. data/lib/ferret/store.rb +5 -0
  121. data/lib/ferret/store/buffered_index_io.rb +191 -0
  122. data/lib/ferret/store/directory.rb +139 -0
  123. data/lib/ferret/store/fs_store.rb +338 -0
  124. data/lib/ferret/store/index_io.rb +259 -0
  125. data/lib/ferret/store/ram_store.rb +282 -0
  126. data/lib/ferret/utils.rb +7 -0
  127. data/lib/ferret/utils/bit_vector.rb +105 -0
  128. data/lib/ferret/utils/date_tools.rb +138 -0
  129. data/lib/ferret/utils/number_tools.rb +91 -0
  130. data/lib/ferret/utils/parameter.rb +41 -0
  131. data/lib/ferret/utils/priority_queue.rb +120 -0
  132. data/lib/ferret/utils/string_helper.rb +47 -0
  133. data/lib/ferret/utils/weak_key_hash.rb +51 -0
  134. data/rake_utils/code_statistics.rb +106 -0
  135. data/setup.rb +1551 -0
  136. data/test/benchmark/tb_ram_store.rb +76 -0
  137. data/test/benchmark/tb_rw_vint.rb +26 -0
  138. data/test/longrunning/tc_numbertools.rb +60 -0
  139. data/test/longrunning/tm_store.rb +19 -0
  140. data/test/test_all.rb +9 -0
  141. data/test/test_helper.rb +6 -0
  142. data/test/unit/analysis/tc_analyzer.rb +21 -0
  143. data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
  144. data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
  145. data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
  146. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
  147. data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
  148. data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
  149. data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
  150. data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
  151. data/test/unit/analysis/tc_stop_filter.rb +14 -0
  152. data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
  153. data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
  154. data/test/unit/analysis/tc_word_list_loader.rb +32 -0
  155. data/test/unit/document/tc_document.rb +47 -0
  156. data/test/unit/document/tc_field.rb +80 -0
  157. data/test/unit/index/tc_compound_file_io.rb +107 -0
  158. data/test/unit/index/tc_field_infos.rb +119 -0
  159. data/test/unit/index/tc_fields_io.rb +167 -0
  160. data/test/unit/index/tc_index.rb +140 -0
  161. data/test/unit/index/tc_index_reader.rb +622 -0
  162. data/test/unit/index/tc_index_writer.rb +57 -0
  163. data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
  164. data/test/unit/index/tc_segment_infos.rb +74 -0
  165. data/test/unit/index/tc_segment_term_docs.rb +17 -0
  166. data/test/unit/index/tc_segment_term_enum.rb +60 -0
  167. data/test/unit/index/tc_segment_term_vector.rb +71 -0
  168. data/test/unit/index/tc_term.rb +22 -0
  169. data/test/unit/index/tc_term_buffer.rb +57 -0
  170. data/test/unit/index/tc_term_info.rb +19 -0
  171. data/test/unit/index/tc_term_infos_io.rb +192 -0
  172. data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
  173. data/test/unit/index/tc_term_vectors_io.rb +108 -0
  174. data/test/unit/index/th_doc.rb +244 -0
  175. data/test/unit/query_parser/tc_query_parser.rb +84 -0
  176. data/test/unit/search/tc_filter.rb +113 -0
  177. data/test/unit/search/tc_fuzzy_query.rb +136 -0
  178. data/test/unit/search/tc_index_searcher.rb +188 -0
  179. data/test/unit/search/tc_search_and_sort.rb +98 -0
  180. data/test/unit/search/tc_similarity.rb +37 -0
  181. data/test/unit/search/tc_sort.rb +48 -0
  182. data/test/unit/search/tc_sort_field.rb +27 -0
  183. data/test/unit/search/tc_spans.rb +153 -0
  184. data/test/unit/store/tc_fs_store.rb +84 -0
  185. data/test/unit/store/tc_ram_store.rb +35 -0
  186. data/test/unit/store/tm_store.rb +180 -0
  187. data/test/unit/store/tm_store_lock.rb +68 -0
  188. data/test/unit/ts_analysis.rb +16 -0
  189. data/test/unit/ts_document.rb +4 -0
  190. data/test/unit/ts_index.rb +18 -0
  191. data/test/unit/ts_query_parser.rb +3 -0
  192. data/test/unit/ts_search.rb +10 -0
  193. data/test/unit/ts_store.rb +6 -0
  194. data/test/unit/ts_utils.rb +10 -0
  195. data/test/unit/utils/tc_bit_vector.rb +65 -0
  196. data/test/unit/utils/tc_date_tools.rb +50 -0
  197. data/test/unit/utils/tc_number_tools.rb +59 -0
  198. data/test/unit/utils/tc_parameter.rb +40 -0
  199. data/test/unit/utils/tc_priority_queue.rb +62 -0
  200. data/test/unit/utils/tc_string_helper.rb +21 -0
  201. data/test/unit/utils/tc_weak_key_hash.rb +25 -0
  202. metadata +251 -0
@@ -0,0 +1,20 @@
1
+ require File.dirname(__FILE__) + "/../../test_helper"
2
+
3
# Exercises WhiteSpaceTokenizer: input is split on whitespace only, so
# punctuation and symbols stay attached to the token they appear in.
class WhiteSpaceTokenizerTest < Test::Unit::TestCase
  include Ferret::Analysis
  include Ferret::Utils::StringHelper

  # Each expected row is the token text followed by the two integer offsets
  # passed to Token.new (they line up with the start and end positions of the
  # text inside the input string).
  def test_whitespacetokenizer
    # There must be THREE spaces between '52' and '#$': the expected offsets
    # below ('52' ends at 34, '#$' starts at 37) only hold with that spacing.
    input = StringReader.new('DBalmain@gmail.com is My e-mail 52   #$ address. 23#@$')
    t = WhiteSpaceTokenizer.new(input)

    expected_tokens = [
      ['DBalmain@gmail.com', 0, 18],
      ['is', 19, 21],
      ['My', 22, 24],
      ['e-mail', 25, 31],
      ['52', 32, 34],
      ['#$', 37, 39],
      ['address.', 40, 48],
      ['23#@$', 49, 54]
    ]
    expected_tokens.each do |text, start_offset, end_offset|
      assert_equal(Token.new(text, start_offset, end_offset), t.next)
    end

    # Once the input is exhausted the tokenizer must return a falsy value.
    assert(!t.next)
  end
end
@@ -0,0 +1,32 @@
1
+ require File.dirname(__FILE__) + "/../../test_helper"
2
+
3
# Checks that WordListLoader builds the same six-word set whether the words
# come from a file on disk or from an in-memory array.
class WordListLoaderTest < Test::Unit::TestCase
  include Ferret::Analysis

  # Loads the fixture file that sits next to this test under data/wordfile.
  def test_word_set_from_file
    word_file = File.dirname(__FILE__) + '/data/wordfile'
    verify_word_set(WordListLoader.word_set_from_file(word_file))
  end

  # Builds the set directly from an array of words.
  def test_word_set_from_array
    words = ['and', 'to', 'it', 'the', 'there', 'their']
    verify_word_set(WordListLoader.word_set_from_array(words))
  end

  private

  # Shared expectations for both loaders: exactly six entries, every listed
  # word present, and a few unrelated words absent.
  def verify_word_set(wl)
    assert_equal(6, wl.size)
    ['and', 'to', 'it', 'the', 'there', 'their'].each do |word|
      assert(wl.member?(word))
    end
    ['horse', 'judo', 'dairy'].each do |word|
      assert(!wl.member?(word))
    end
  end
end
@@ -0,0 +1,47 @@
1
+ require File.dirname(__FILE__) + "/../../test_helper"
2
+
3
# Tests Document's handling of several fields that share one name, for both
# string and binary field values.
class DocumentTest < Test::Unit::TestCase
  include Ferret::Document

  # Adds three "field1" fields and one "field2" field, then checks lookup,
  # single removal (which hands back the first field added) and bulk removal.
  def test_document
    doc = Document.new
    first, second, third = ["value1", "value2", "value3"].map do |value|
      Field.new("field1", value, Field::Store::YES, Field::Index::NO)
    end
    other = Field.new("field2", "value1", Field::Store::YES, Field::Index::NO)
    [first, second, third, other].each { |f| doc.add_field(f) }

    assert_equal(3, doc.fields("field1").size)
    assert_equal(1, doc.fields("field2").size)

    removed = doc.remove_field("field1")
    assert_equal(2, doc.fields("field1").size)
    assert_equal(first, removed)
    assert_equal("value2 value3", doc.values("field1"))

    doc.remove_fields("field1")
    assert_equal(nil, doc.field("field1"))
  end

  # Mixes two string fields and two binary fields under the same name and
  # checks that string values and binary values are reported separately.
  def test_binary_string
    bin1 = (0...256).to_a.pack("c*")
    bin2 = (0...56).to_a.pack("c*")

    doc = Document.new
    fs1 = Field.new("field1", "value1", Field::Store::YES, Field::Index::NO)
    fs2 = Field.new("field1", "value2", Field::Store::YES, Field::Index::NO)
    fb1 = Field.new_binary_field("field1", bin1, Field::Store::YES)
    fb2 = Field.new_binary_field("field1", bin2, Field::Store::YES)
    [fs1, fs2, fb1, fb2].each { |f| doc.add_field(f) }

    assert_equal(4, doc.fields("field1").size)
    assert_equal("value1 value2", doc.values("field1").strip)
    assert_equal([bin1, bin2], doc.binaries("field1"))
  end
end
@@ -0,0 +1,80 @@
1
+ require File.dirname(__FILE__) + "/../../test_helper"
2
+
3
+
4
# Tests for Field: the string names of its Store / Index / TermVector
# parameters, the flags derived from them, and the setters that change them.
class FieldTest < Test::Unit::TestCase
  include Ferret::Document
  include Ferret::Utils

  def test_store
    check_names([
      ["COMPRESS", Field::Store::COMPRESS],
      ["YES", Field::Store::YES],
      ["NO", Field::Store::NO]
    ])
  end

  def test_index
    check_names([
      ["TOKENIZED", Field::Index::TOKENIZED],
      ["UNTOKENIZED", Field::Index::UNTOKENIZED],
      ["NO", Field::Index::NO]
    ])
  end

  def test_term_vector
    check_names([
      ["YES", Field::TermVector::YES],
      ["NO", Field::TermVector::NO],
      ["WITH_POSITIONS", Field::TermVector::WITH_POSITIONS],
      ["WITH_OFFSETS", Field::TermVector::WITH_OFFSETS],
      ["WITH_POSITIONS_OFFSETS", Field::TermVector::WITH_POSITIONS_OFFSETS]
    ])
  end

  # A compressed, tokenized field reports stored/compressed/indexed/tokenized
  # and none of the term-vector or binary flags.
  def test_standard_field
    f = compressed_tokenized_field
    assert_equal("name", f.name)
    assert_equal("value", f.data)
    check_flags(f, [
      [:stored?, true],
      [:compressed?, true],
      [:indexed?, true],
      [:tokenized?, true],
      [:store_term_vector?, false],
      [:store_offsets?, false],
      [:store_positions?, false],
      [:binary?, false]
    ])
  end

  # Switching storage off clears the compressed flag as well.
  def test_set_store
    f = compressed_tokenized_field
    f.stored = Field::Store::NO
    check_flags(f, [[:stored?, false], [:compressed?, false]])
  end

  # Switching indexing off clears the tokenized flag as well.
  def test_set_index
    f = compressed_tokenized_field
    f.index = Field::Index::NO
    check_flags(f, [[:indexed?, false], [:tokenized?, false]])
  end

  # WITH_POSITIONS_OFFSETS turns on all three term-vector flags.
  def test_set_term_vector
    f = compressed_tokenized_field
    f.store_term_vector = Field::TermVector::WITH_POSITIONS_OFFSETS
    check_flags(f, [
      [:store_term_vector?, true],
      [:store_offsets?, true],
      [:store_positions?, true]
    ])
  end

  # A binary field is stored only: not compressed, indexed or tokenized, and
  # without term vectors.
  def test_new_binary_field
    bin = (0...256).to_a.pack("c*")
    f = Field.new_binary_field("name", bin, Field::Store::YES)
    assert_equal("name", f.name)
    assert_equal(bin, f.data)
    check_flags(f, [
      [:stored?, true],
      [:compressed?, false],
      [:indexed?, false],
      [:tokenized?, false],
      [:store_term_vector?, false],
      [:store_offsets?, false],
      [:store_positions?, false],
      [:binary?, true]
    ])
  end

  private

  # The field most tests start from.
  def compressed_tokenized_field
    Field.new("name", "value", Field::Store::COMPRESS, Field::Index::TOKENIZED)
  end

  # Asserts that each parameter object's to_s equals the paired label.
  def check_names(pairs)
    pairs.each { |label, parameter| assert_equal(label, parameter.to_s) }
  end

  # Asserts, in the given order, that each predicate returns exactly the
  # paired boolean.
  def check_flags(field, expectations)
    expectations.each do |predicate, expected|
      assert_equal(expected, field.send(predicate))
    end
  end
end
@@ -0,0 +1,107 @@
1
+ require File.dirname(__FILE__) + "/../../test_helper"
2
+
3
+
4
# Verifies the layout CompoundFileWriter produces when it bundles two files
# from a RAM directory into a single compound file.
class CompoundFileWriterTest < Test::Unit::TestCase
  include Ferret::Index

  def setup
    @dir = Ferret::Store::RAMDirectory.new
  end

  # Must be spelled +teardown+ (one word): that is the hook Test::Unit calls
  # after every test. A method named +tear_down+ is never invoked, which
  # would leave the directory open.
  def teardown
    @dir.close
  end

  # Writes two small files, bundles them, then reads the compound file back
  # field by field: entry count, then (data offset, file name) per entry,
  # then the raw contents of each file in the order they were added.
  def test_writer
    file1 = @dir.create_output("file1")
    file2 = @dir.create_output("file2")
    file1.write_int(20)
    file2.write_string('this is file2')
    file1.close
    file2.close

    cfile_writer = CompoundFileWriter.new(@dir, "cfile")
    cfile_writer.add_file("file1")
    cfile_writer.add_file("file2")
    cfile_writer.close

    cfile = @dir.open_input("cfile")
    assert_equal(2, cfile.read_vint)
    # file1 holds a single int, so the two data offsets are 4 bytes apart.
    assert_equal(29, cfile.read_long, "Offset is incorrect")
    assert_equal("file1", cfile.read_string, "Filename is incorrect")
    assert_equal(33, cfile.read_long, "Offset is incorrect")
    assert_equal("file2", cfile.read_string, "Filename is incorrect")
    assert_equal(20, cfile.read_int, "Content is incorrect")
    assert_equal('this is file2', cfile.read_string, "Content is incorrect")
  end
end
38
+
39
# Verifies that CompoundFileReader can open the individual files stored in a
# hand-written compound file.
class CompoundFileReaderTest < Test::Unit::TestCase
  include Ferret::Index

  def setup
    @dir = Ferret::Store::RAMDirectory.new
  end

  # Must be spelled +teardown+ (one word): that is the hook Test::Unit calls
  # after every test. A method named +tear_down+ is never invoked, which
  # would leave the directory open.
  def teardown
    @dir.close
  end

  # Hand-writes a compound file (entry count, then offset + name per entry,
  # then the file contents) and checks the reader reports the right lengths
  # and contents for both entries.
  def test_reader
    cfile = @dir.create_output("cfile")
    cfile.write_vint(2)
    cfile.write_long(29)
    cfile.write_string('file1')
    cfile.write_long(33)
    cfile.write_string('file2')
    cfile.write_int(20)
    cfile.write_string("this is file 2")
    cfile.close

    cfile_reader = CompoundFileReader.new(@dir, "cfile")
    assert_equal(4, cfile_reader.file_length('file1'))
    assert_equal(15, cfile_reader.file_length('file2'))

    file1 = cfile_reader.open_input('file1')
    file2 = cfile_reader.open_input('file2')
    assert_equal(20, file1.read_int)
    assert_equal('this is file 2', file2.read_string)
    file1.close
    file2.close
  end
end
73
+
74
# Round-trip test: files written through CompoundFileWriter must be readable
# through CompoundFileReader, including a file with a large payload.
class CompoundFileIOTest < Test::Unit::TestCase
  include Ferret::Index

  def setup
    @dir = Ferret::Store::RAMDirectory.new
  end

  # Must be spelled +teardown+ (one word): that is the hook Test::Unit calls
  # after every test. A method named +tear_down+ is never invoked, which
  # would leave the directory open.
  def teardown
    @dir.close
  end

  # Bundles three files, the middle one holding a 13,000-character string,
  # and checks that the middle file reads back intact.
  def test_buffer
    file1 = @dir.create_output("file1")
    file2 = @dir.create_output("file2")
    file3 = @dir.create_output("file3")
    20.times { file1.write_int(rand(10000)) }
    file2.write_string('this is file2' * 1000)
    file3.write_string('this is file2')
    [file1, file2, file3].each { |f| f.close }

    cfile_writer = CompoundFileWriter.new(@dir, "cfile")
    ["file1", "file2", "file3"].each { |name| cfile_writer.add_file(name) }
    cfile_writer.close

    cfile_reader = CompoundFileReader.new(@dir, "cfile")
    file2 = cfile_reader.open_input('file2')
    assert_equal('this is file2' * 1000, file2.read_string)
    file2.close
  end
end
@@ -0,0 +1,119 @@
1
+ require File.dirname(__FILE__) + "/../../test_helper"
2
+
3
# Tests for FieldInfo (one field's indexing flags) and FieldInfos (the
# collection, addressable by field name or field number, and persistable to
# a directory).
class FieldInfosTest < Test::Unit::TestCase
  include Ferret::Index

  # Covers the FieldInfo constructor, its attribute writers, set! and the
  # defaults used when the trailing constructor arguments are omitted.
  def test_field_info
    fi = FieldInfo.new("name", true, 1, true, true, true)
    # assert_equal takes (expected, actual); keeping that order makes the
    # failure messages read correctly.
    assert_equal("name", fi.name)
    assert_equal(1, fi.number)
    assert(fi.indexed?)
    assert(fi.store_term_vector?)
    assert(fi.store_offsets?)
    assert(fi.store_positions?)

    fi.name = "hello"
    fi.indexed = false
    fi.number = 2
    fi.store_term_vector = false
    fi.store_offset = false
    fi.store_position = false

    assert_equal("hello", fi.name)
    assert_equal(2, fi.number)
    assert(!fi.indexed?)
    assert(!fi.store_term_vector?)
    assert(!fi.store_offsets?)
    assert(!fi.store_positions?)

    fi.set!(true, true, true, true)
    assert(fi.indexed?)
    assert(fi.store_term_vector?)
    assert(fi.store_offsets?)
    assert(fi.store_positions?)

    # With only four arguments the offset and position flags are off.
    fi = FieldInfo.new("name", true, 1, true)
    assert(!fi.store_offsets?)
    assert(!fi.store_positions?)
  end

  # Helper (not a test): asserts every attribute of a FieldInfo in one call.
  def fi_test_attr(fi, name, number, indexed, store_tv, store_pos, store_off)
    assert_equal(name, fi.name)
    assert_equal(number, fi.number)
    assert_equal(indexed, fi.indexed?)
    assert_equal(store_tv, fi.store_term_vector?)
    assert_equal(store_pos, fi.store_positions?)
    assert_equal(store_off, fi.store_offsets?)
  end

  # Re-adding an existing field name keeps its number and the collection
  # size, while the flags passed on the later call show up on the result.
  def test_fis_add
    fis = FieldInfos.new
    fi = fis.add("field1", false)
    fi_test_attr(fi, "field1", 0, false, false, false, false)
    assert_equal(1, fis.size)

    fi = fis.add("field1", true, true)
    fi_test_attr(fi, "field1", 0, true, true, false, false)
    assert_equal(1, fis.size)

    fi = fis.add("field2", false)
    fi_test_attr(fi, "field2", 1, false, false, false, false)
    assert_equal(2, fis.size)

    fi = fis.add("field1", true, true, true, true)
    assert_equal(fi, fis[fi.number])
    assert_equal(fi, fis["field1"])
    assert_equal(0, fis.field_number("field1"))
    assert_equal(1, fis.field_number("field2"))
    assert_equal(FieldInfos::NOT_A_FIELD, fis.field_number("field3"))
    assert_equal(nil, fis["field3"])
    fi_test_attr(fi, "field1", 0, true, true, true, true)
    assert_equal(2, fis.size)
  end

  # Adds every field of the shared helper document, writes the infos to a
  # RAM directory and reads them back.
  def test_add_doc_fields
    doc = IndexTestHelper.prepare_document
    fis = FieldInfos.new
    fis << doc
    dir = Ferret::Store::RAMDirectory.new
    fis.write_to_dir(dir, "_test")
    fis2 = FieldInfos.new(dir, "_test")
    assert_equal("text_field1", fis2["text_field1"].name)
    fn = fis2.field_number("text_field2")
    assert_equal("text_field2", fis2[fn].name)
    assert_equal(9, fis2.size)
    # NOTE(review): this checks the in-memory fis, not the reloaded fis2 --
    # confirm which one was intended.
    assert(fis.has_vectors?)
  end

  # has_vectors? becomes true only once a field that stores term vectors
  # has been added.
  def test_fis_has_vectors
    fis = FieldInfos.new
    assert(!fis.has_vectors?)
    fis.add("random_field")
    assert(!fis.has_vectors?)
    fis.add("store_term_vector_field", true, true, false, false)
    assert(fis.has_vectors?)
  end

  # Writes five fields with progressively more flags enabled and checks that
  # a fresh FieldInfos loaded from the directory reproduces each of them.
  def test_fis_rw
    dir = Ferret::Store::RAMDirectory.new
    written = FieldInfos.new
    written.add("field1", false, false, false, false)
    written.add("field2", true, false, false, false)
    written.add("field3", true, true, false, false)
    written.add("field4", true, true, true, false)
    written.add("field5", true, true, true, true)
    written.write_to_dir(dir, "fis_rw.test")

    fis = FieldInfos.new(dir, "fis_rw.test")
    fi_test_attr(fis[0], "field1", 0, false, false, false, false)
    fi_test_attr(fis[1], "field2", 1, true, false, false, false)
    fi_test_attr(fis[2], "field3", 2, true, true, false, false)
    fi_test_attr(fis[3], "field4", 3, true, true, true, false)
    fi_test_attr(fis[4], "field5", 4, true, true, true, true)

    assert_equal(5, fis.size)
  end
end
@@ -0,0 +1,167 @@
1
+ require File.dirname(__FILE__) + "/../../test_helper"
2
+
3
# Verifies the raw stored-fields data that FieldsWriter produces for a
# two-field document.
class FieldsWriterTest < Test::Unit::TestCase
  include Ferret::Index
  include Ferret::Document

  def setup
    @dir = Ferret::Store::RAMDirectory.new
  end

  # Must be spelled +teardown+ (one word): that is the hook Test::Unit calls
  # after every test. A method named +tear_down+ is never invoked, which
  # would leave the directory open.
  def teardown
    @dir.close
  end

  # Writes one document and reads the .fdt file back by hand: the stored
  # field count, then (field number, flag byte, value) for each field.
  def test_writer
    doc = Document.new
    doc << Field.new("name", "daily news", Field::Store::YES)
    doc << Field.new("content", "Nothing happened today.", Field::Store::YES)

    infos = FieldInfos.new
    infos << doc

    writer = FieldsWriter.new(@dir, "fieldswritertest", infos)
    writer << doc
    writer.close

    fstream = @dir.open_input("fieldswritertest.fdt")
    # The index stream is opened only to show that the .fdx file was created;
    # its contents are not inspected here.
    istream = @dir.open_input("fieldswritertest.fdx")

    assert_equal(2, fstream.read_vint)

    fstream.read_vint # field number of the first field (value not checked)
    flags1 = fstream.read_byte
    data1 = fstream.read_string
    # NOTE(review): OR-ing with a flag constant is non-zero whenever the
    # constant is, so this assertion cannot fail. A bitwise AND was probably
    # intended, but whether the tokenized bit is set for a Field built with
    # the default index option must be confirmed before tightening it.
    assert((flags1 | FieldsWriter::FIELD_IS_TOKENIZED) != 0)
    assert_equal("daily news", data1)

    fstream.read_vint # field number of the second field (value not checked)
    flags2 = fstream.read_byte
    data2 = fstream.read_string
    # NOTE(review): same vacuous check as for the first field.
    assert((flags2 | FieldsWriter::FIELD_IS_TOKENIZED) != 0)
    assert_equal("Nothing happened today.", data2)

    fstream.close
    istream.close
  end
end
47
+
48
# Verifies that FieldsReader can rebuild a document from hand-written
# stored-fields files.
class FieldsReaderTest < Test::Unit::TestCase
  include Ferret::Index
  include Ferret::Document

  def setup
    @dir = Ferret::Store::RAMDirectory.new
  end

  # Must be spelled +teardown+ (one word): that is the hook Test::Unit calls
  # after every test. A method named +tear_down+ is never invoked, which
  # would leave the directory open.
  def teardown
    @dir.close
  end

  # Hand-writes the .fdx and .fdt files for one two-field document and
  # checks that the reader returns both field values.
  def test_doc
    doc = Document.new
    doc << Field.new("name", "daily news")
    doc << Field.new("content", "Nothing happened today.")

    # The document itself is never written; it only supplies the field names
    # and numbers the reader needs.
    infos = FieldInfos.new
    infos << doc

    fstream = @dir.create_output("fieldsreadertest.fdt")
    istream = @dir.create_output("fieldsreadertest.fdx")

    # Index file: a single long, 0, for document 0.
    istream.write_long(0)
    istream.close

    # Data file: stored field count, then per field its number, a flag byte
    # and the string value.
    fstream.write_vint(2)
    fstream.write_vint(0)
    fstream.write_byte(0)
    fstream.write_string("daily news")
    fstream.write_vint(1)
    fstream.write_byte(0)
    fstream.write_string("Nothing happened today.")
    fstream.close

    reader = FieldsReader.new(@dir, "fieldsreadertest", infos)
    docres = reader.doc(0)

    # assert_equal takes (expected, actual).
    assert_equal("daily news", docres.field("name").data)
    assert_equal("Nothing happened today.", docres.field("content").data)
  end
end
90
+
91
# Round-trip test: the shared helper document is written with FieldsWriter
# and read back with FieldsReader, and each test checks one of the
# recovered fields.
class FieldsIOTest < Test::Unit::TestCase
  include Ferret::Index
  include Ferret::Document

  # Writes the helper document to a fresh RAM directory and keeps the
  # document read back from it in @docres.
  def setup
    @dir = Ferret::Store::RAMDirectory.new
    doc = IndexTestHelper.prepare_document
    infos = FieldInfos.new
    infos << doc

    writer = FieldsWriter.new(@dir, "field_types", infos)
    writer << doc
    writer.close

    reader = FieldsReader.new(@dir, "field_types", infos)
    @docres = reader.doc(0)
  end

  # Must be spelled +teardown+ (one word): that is the hook Test::Unit calls
  # after every test. A method named +tear_down+ is never invoked, which
  # would leave the directory open.
  def teardown
    @dir.close
  end

  def test_text_field_no_term_vector
    field = @docres.field("text_field1")
    check_field_values(field, "field one text", true, true, true, false, false)
  end

  def test_text_field_term_vector
    field = @docres.field("text_field2")
    check_field_values(field, "field field field two text", true, true, true, true, false)
  end

  def test_key_field
    field = @docres.field("key_field")
    check_field_values(field, "keyword", true, true, false, false, false)
  end

  def test_unindexed_field
    field = @docres.field("unindexed_field")
    check_field_values(field, "unindexed field text", true, false, false, false, false)
  end

  # An unstored field does not come back from the stored-fields files at all.
  def test_unstored_field_no_term_vector
    field = @docres.field("unstored_field1")
    assert_equal(nil, field)
  end

  def test_compressed_field
    field = @docres.field("compressed_field")
    check_field_values(field, "compressed text", true, true, true, true, false)
  end

  def test_binary_field
    bin = IndexTestHelper::BINARY_DATA
    field = @docres.field("binary_field")
    check_field_values(field, bin, true, false, false, false, true)
  end

  def test_compressed_binary_field
    cbin = IndexTestHelper::COMPRESSED_BINARY_DATA
    field = @docres.field("compressed_binary_field")
    check_field_values(field, cbin, true, false, false, false, true)
  end

  private

  # Asserts a recovered field's data and its stored / indexed / tokenized /
  # term-vector / binary flags against the expected values.
  def check_field_values(field, value, stored, indexed, tokenized, term_vector, binary)
    assert_equal(value, field.data)
    assert_equal(stored, field.stored?)
    assert_equal(indexed, field.indexed?)
    assert_equal(tokenized, field.tokenized?)
    assert_equal(term_vector, field.store_term_vector?)
    assert_equal(binary, field.binary?)
  end
end