ferret 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202)
  1. data/MIT-LICENSE +20 -0
  2. data/README +109 -0
  3. data/Rakefile +275 -0
  4. data/TODO +9 -0
  5. data/TUTORIAL +197 -0
  6. data/ext/extconf.rb +3 -0
  7. data/ext/ferret.c +23 -0
  8. data/ext/ferret.h +85 -0
  9. data/ext/index_io.c +543 -0
  10. data/ext/priority_queue.c +227 -0
  11. data/ext/ram_directory.c +316 -0
  12. data/ext/segment_merge_queue.c +41 -0
  13. data/ext/string_helper.c +42 -0
  14. data/ext/tags +240 -0
  15. data/ext/term.c +261 -0
  16. data/ext/term_buffer.c +299 -0
  17. data/ext/util.c +12 -0
  18. data/lib/ferret.rb +41 -0
  19. data/lib/ferret/analysis.rb +11 -0
  20. data/lib/ferret/analysis/analyzers.rb +93 -0
  21. data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
  22. data/lib/ferret/analysis/token.rb +79 -0
  23. data/lib/ferret/analysis/token_filters.rb +86 -0
  24. data/lib/ferret/analysis/token_stream.rb +26 -0
  25. data/lib/ferret/analysis/tokenizers.rb +107 -0
  26. data/lib/ferret/analysis/word_list_loader.rb +27 -0
  27. data/lib/ferret/document.rb +2 -0
  28. data/lib/ferret/document/document.rb +152 -0
  29. data/lib/ferret/document/field.rb +304 -0
  30. data/lib/ferret/index.rb +26 -0
  31. data/lib/ferret/index/compound_file_io.rb +343 -0
  32. data/lib/ferret/index/document_writer.rb +288 -0
  33. data/lib/ferret/index/field_infos.rb +259 -0
  34. data/lib/ferret/index/fields_io.rb +175 -0
  35. data/lib/ferret/index/index.rb +228 -0
  36. data/lib/ferret/index/index_file_names.rb +33 -0
  37. data/lib/ferret/index/index_reader.rb +462 -0
  38. data/lib/ferret/index/index_writer.rb +488 -0
  39. data/lib/ferret/index/multi_reader.rb +363 -0
  40. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
  41. data/lib/ferret/index/segment_infos.rb +130 -0
  42. data/lib/ferret/index/segment_merge_info.rb +47 -0
  43. data/lib/ferret/index/segment_merge_queue.rb +16 -0
  44. data/lib/ferret/index/segment_merger.rb +337 -0
  45. data/lib/ferret/index/segment_reader.rb +380 -0
  46. data/lib/ferret/index/segment_term_enum.rb +178 -0
  47. data/lib/ferret/index/segment_term_vector.rb +58 -0
  48. data/lib/ferret/index/term.rb +49 -0
  49. data/lib/ferret/index/term_buffer.rb +88 -0
  50. data/lib/ferret/index/term_doc_enum.rb +283 -0
  51. data/lib/ferret/index/term_enum.rb +52 -0
  52. data/lib/ferret/index/term_info.rb +41 -0
  53. data/lib/ferret/index/term_infos_io.rb +312 -0
  54. data/lib/ferret/index/term_vector_offset_info.rb +20 -0
  55. data/lib/ferret/index/term_vectors_io.rb +552 -0
  56. data/lib/ferret/query_parser.rb +274 -0
  57. data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
  58. data/lib/ferret/search.rb +49 -0
  59. data/lib/ferret/search/boolean_clause.rb +100 -0
  60. data/lib/ferret/search/boolean_query.rb +303 -0
  61. data/lib/ferret/search/boolean_scorer.rb +294 -0
  62. data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
  63. data/lib/ferret/search/conjunction_scorer.rb +99 -0
  64. data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
  65. data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
  66. data/lib/ferret/search/explanation.rb +41 -0
  67. data/lib/ferret/search/field_cache.rb +216 -0
  68. data/lib/ferret/search/field_doc.rb +31 -0
  69. data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
  70. data/lib/ferret/search/filter.rb +11 -0
  71. data/lib/ferret/search/filtered_query.rb +130 -0
  72. data/lib/ferret/search/filtered_term_enum.rb +79 -0
  73. data/lib/ferret/search/fuzzy_query.rb +153 -0
  74. data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
  75. data/lib/ferret/search/hit_collector.rb +34 -0
  76. data/lib/ferret/search/hit_queue.rb +11 -0
  77. data/lib/ferret/search/index_searcher.rb +173 -0
  78. data/lib/ferret/search/match_all_docs_query.rb +104 -0
  79. data/lib/ferret/search/multi_phrase_query.rb +204 -0
  80. data/lib/ferret/search/multi_term_query.rb +65 -0
  81. data/lib/ferret/search/non_matching_scorer.rb +22 -0
  82. data/lib/ferret/search/phrase_positions.rb +55 -0
  83. data/lib/ferret/search/phrase_query.rb +217 -0
  84. data/lib/ferret/search/phrase_scorer.rb +153 -0
  85. data/lib/ferret/search/prefix_query.rb +47 -0
  86. data/lib/ferret/search/query.rb +111 -0
  87. data/lib/ferret/search/query_filter.rb +51 -0
  88. data/lib/ferret/search/range_filter.rb +103 -0
  89. data/lib/ferret/search/range_query.rb +139 -0
  90. data/lib/ferret/search/req_excl_scorer.rb +125 -0
  91. data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
  92. data/lib/ferret/search/score_doc.rb +38 -0
  93. data/lib/ferret/search/score_doc_comparator.rb +114 -0
  94. data/lib/ferret/search/scorer.rb +91 -0
  95. data/lib/ferret/search/similarity.rb +278 -0
  96. data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
  97. data/lib/ferret/search/sort.rb +105 -0
  98. data/lib/ferret/search/sort_comparator.rb +60 -0
  99. data/lib/ferret/search/sort_field.rb +87 -0
  100. data/lib/ferret/search/spans.rb +12 -0
  101. data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
  102. data/lib/ferret/search/spans/span_first_query.rb +79 -0
  103. data/lib/ferret/search/spans/span_near_query.rb +108 -0
  104. data/lib/ferret/search/spans/span_not_query.rb +130 -0
  105. data/lib/ferret/search/spans/span_or_query.rb +176 -0
  106. data/lib/ferret/search/spans/span_query.rb +25 -0
  107. data/lib/ferret/search/spans/span_scorer.rb +74 -0
  108. data/lib/ferret/search/spans/span_term_query.rb +105 -0
  109. data/lib/ferret/search/spans/span_weight.rb +84 -0
  110. data/lib/ferret/search/spans/spans_enum.rb +44 -0
  111. data/lib/ferret/search/term_query.rb +128 -0
  112. data/lib/ferret/search/term_scorer.rb +181 -0
  113. data/lib/ferret/search/top_docs.rb +24 -0
  114. data/lib/ferret/search/top_field_docs.rb +17 -0
  115. data/lib/ferret/search/weight.rb +54 -0
  116. data/lib/ferret/search/wildcard_query.rb +26 -0
  117. data/lib/ferret/search/wildcard_term_enum.rb +61 -0
  118. data/lib/ferret/stemmers.rb +1 -0
  119. data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
  120. data/lib/ferret/store.rb +5 -0
  121. data/lib/ferret/store/buffered_index_io.rb +191 -0
  122. data/lib/ferret/store/directory.rb +139 -0
  123. data/lib/ferret/store/fs_store.rb +338 -0
  124. data/lib/ferret/store/index_io.rb +259 -0
  125. data/lib/ferret/store/ram_store.rb +282 -0
  126. data/lib/ferret/utils.rb +7 -0
  127. data/lib/ferret/utils/bit_vector.rb +105 -0
  128. data/lib/ferret/utils/date_tools.rb +138 -0
  129. data/lib/ferret/utils/number_tools.rb +91 -0
  130. data/lib/ferret/utils/parameter.rb +41 -0
  131. data/lib/ferret/utils/priority_queue.rb +120 -0
  132. data/lib/ferret/utils/string_helper.rb +47 -0
  133. data/lib/ferret/utils/weak_key_hash.rb +51 -0
  134. data/rake_utils/code_statistics.rb +106 -0
  135. data/setup.rb +1551 -0
  136. data/test/benchmark/tb_ram_store.rb +76 -0
  137. data/test/benchmark/tb_rw_vint.rb +26 -0
  138. data/test/longrunning/tc_numbertools.rb +60 -0
  139. data/test/longrunning/tm_store.rb +19 -0
  140. data/test/test_all.rb +9 -0
  141. data/test/test_helper.rb +6 -0
  142. data/test/unit/analysis/tc_analyzer.rb +21 -0
  143. data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
  144. data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
  145. data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
  146. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
  147. data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
  148. data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
  149. data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
  150. data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
  151. data/test/unit/analysis/tc_stop_filter.rb +14 -0
  152. data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
  153. data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
  154. data/test/unit/analysis/tc_word_list_loader.rb +32 -0
  155. data/test/unit/document/tc_document.rb +47 -0
  156. data/test/unit/document/tc_field.rb +80 -0
  157. data/test/unit/index/tc_compound_file_io.rb +107 -0
  158. data/test/unit/index/tc_field_infos.rb +119 -0
  159. data/test/unit/index/tc_fields_io.rb +167 -0
  160. data/test/unit/index/tc_index.rb +140 -0
  161. data/test/unit/index/tc_index_reader.rb +622 -0
  162. data/test/unit/index/tc_index_writer.rb +57 -0
  163. data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
  164. data/test/unit/index/tc_segment_infos.rb +74 -0
  165. data/test/unit/index/tc_segment_term_docs.rb +17 -0
  166. data/test/unit/index/tc_segment_term_enum.rb +60 -0
  167. data/test/unit/index/tc_segment_term_vector.rb +71 -0
  168. data/test/unit/index/tc_term.rb +22 -0
  169. data/test/unit/index/tc_term_buffer.rb +57 -0
  170. data/test/unit/index/tc_term_info.rb +19 -0
  171. data/test/unit/index/tc_term_infos_io.rb +192 -0
  172. data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
  173. data/test/unit/index/tc_term_vectors_io.rb +108 -0
  174. data/test/unit/index/th_doc.rb +244 -0
  175. data/test/unit/query_parser/tc_query_parser.rb +84 -0
  176. data/test/unit/search/tc_filter.rb +113 -0
  177. data/test/unit/search/tc_fuzzy_query.rb +136 -0
  178. data/test/unit/search/tc_index_searcher.rb +188 -0
  179. data/test/unit/search/tc_search_and_sort.rb +98 -0
  180. data/test/unit/search/tc_similarity.rb +37 -0
  181. data/test/unit/search/tc_sort.rb +48 -0
  182. data/test/unit/search/tc_sort_field.rb +27 -0
  183. data/test/unit/search/tc_spans.rb +153 -0
  184. data/test/unit/store/tc_fs_store.rb +84 -0
  185. data/test/unit/store/tc_ram_store.rb +35 -0
  186. data/test/unit/store/tm_store.rb +180 -0
  187. data/test/unit/store/tm_store_lock.rb +68 -0
  188. data/test/unit/ts_analysis.rb +16 -0
  189. data/test/unit/ts_document.rb +4 -0
  190. data/test/unit/ts_index.rb +18 -0
  191. data/test/unit/ts_query_parser.rb +3 -0
  192. data/test/unit/ts_search.rb +10 -0
  193. data/test/unit/ts_store.rb +6 -0
  194. data/test/unit/ts_utils.rb +10 -0
  195. data/test/unit/utils/tc_bit_vector.rb +65 -0
  196. data/test/unit/utils/tc_date_tools.rb +50 -0
  197. data/test/unit/utils/tc_number_tools.rb +59 -0
  198. data/test/unit/utils/tc_parameter.rb +40 -0
  199. data/test/unit/utils/tc_priority_queue.rb +62 -0
  200. data/test/unit/utils/tc_string_helper.rb +21 -0
  201. data/test/unit/utils/tc_weak_key_hash.rb +25 -0
  202. metadata +251 -0
@@ -0,0 +1,20 @@
1
+ require File.dirname(__FILE__) + "/../../test_helper"
2
+
3
# Exercises WhiteSpaceTokenizer: the test expects each whitespace-delimited
# chunk of the input (punctuation included) to come back as one Token, and
# the two integers passed to Token.new match that chunk's start and end
# positions in the input string.
class WhiteSpaceTokenizerTest < Test::Unit::TestCase
  include Ferret::Analysis
  include Ferret::Utils::StringHelper

  # Tokenizes a sample sentence, checks every token in order, then checks
  # that the stream reports exhaustion with a falsy value.
  def test_whitespacetokenizer()
    # Three whitespace characters separate '52' and '#$': the expected
    # offsets place '52' at 32..34 and '#$' at 37..39, which a single
    # separator cannot produce. Spaces are assumed here -- confirm against
    # the released file if the exact whitespace characters matter.
    input = StringReader.new('DBalmain@gmail.com is My e-mail 52   #$ address. 23#@$')
    t = WhiteSpaceTokenizer.new(input)

    expected_tokens = [
      ['DBalmain@gmail.com', 0, 18],
      ['is', 19, 21],
      ['My', 22, 24],
      ['e-mail', 25, 31],
      ['52', 32, 34],
      ['#$', 37, 39],
      ['address.', 40, 48],
      ['23#@$', 49, 54]
    ]
    expected_tokens.each do |text, start_offset, end_offset|
      assert_equal(Token.new(text, start_offset, end_offset), t.next())
    end

    assert(! t.next())
  end
end
@@ -0,0 +1,32 @@
1
+ require File.dirname(__FILE__) + "/../../test_helper"
2
+
3
# Checks the two WordListLoader entry points used here: loading a word set
# from a file and building one from an array. Both tests assert the same
# six members and the same three non-members.
class WordListLoaderTest < Test::Unit::TestCase
  include Ferret::Analysis

  # Loads the set from the 'data/wordfile' fixture located relative to this
  # test file.
  def test_word_set_from_file()
    fixture_path = File.dirname(__FILE__) + '/data/wordfile'
    verify_word_set(WordListLoader.word_set_from_file(fixture_path))
  end

  # Builds the set directly from an in-memory array of words.
  def test_word_set_from_array()
    words = ['and','to','it','the','there','their']
    verify_word_set(WordListLoader.word_set_from_array(words))
  end

  private

  # Shared assertions: the set holds exactly six entries, contains each of
  # the expected words, and rejects three words that were never added.
  def verify_word_set(word_set)
    assert_equal(6, word_set.size())
    ['and', 'to', 'it', 'the', 'there', 'their'].each do |word|
      assert(word_set.member?(word))
    end
    ['horse', 'judo', 'dairy'].each do |word|
      assert(!word_set.member?(word))
    end
  end
end
@@ -0,0 +1,47 @@
1
+ require File.dirname(__FILE__) + "/../../test_helper"
2
+
3
# Tests Document's handling of several fields that share one name: adding,
# counting, removing one or all of them, and reading string and binary
# values back.
class DocumentTest < Test::Unit::TestCase
  include Ferret::Document

  # Adds three "field1" fields and one "field2" field, then checks that
  # remove_field drops only the first "field1" entry (and returns it) while
  # remove_fields drops the rest.
  def test_document()
    doc = Document.new()
    first, second, third = ["value1", "value2", "value3"].map do |value|
      Field.new("field1", value, Field::Store::YES, Field::Index::NO)
    end
    other = Field.new("field2", "value1", Field::Store::YES, Field::Index::NO)
    [first, second, third, other].each { |f| doc.add_field(f) }

    assert_equal(3, doc.fields("field1").size)
    assert_equal(1, doc.fields("field2").size)

    removed = doc.remove_field("field1")
    assert_equal(2, doc.fields("field1").size)
    assert_equal(first, removed)
    assert_equal("value2 value3", doc.values("field1"))

    doc.remove_fields("field1")
    assert_equal(nil, doc.field("field1"))
  end

  # Mixes two string fields and two binary fields under the same name and
  # checks that values() reports only the strings and binaries() only the
  # binary payloads.
  def test_binary_string()
    # Byte strings holding 0..255 and 0..55 respectively.
    bin1 = (0...256).to_a.pack("c*")
    bin2 = (0...56).to_a.pack("c*")

    doc = Document.new()
    string_fields = ["value1", "value2"].map do |value|
      Field.new("field1", value, Field::Store::YES, Field::Index::NO)
    end
    binary_fields = [bin1, bin2].map do |bytes|
      Field.new_binary_field("field1", bytes, Field::Store::YES)
    end
    (string_fields + binary_fields).each { |f| doc.add_field(f) }

    assert_equal(4, doc.fields("field1").size)
    assert_equal("value1 value2", doc.values("field1").strip)
    assert_equal([bin1, bin2], doc.binaries("field1"))
  end
end
@@ -0,0 +1,80 @@
1
+ require File.dirname(__FILE__) + "/../../test_helper"
2
+
3
+
4
# Tests Field: the string form of its Store / Index / TermVector option
# constants, the predicate flags reported for a freshly built field, and
# how the setters change those flags.
class FieldTest < Test::Unit::TestCase
  include Ferret::Document
  include Ferret::Utils

  # Each Store constant stringifies to its own name.
  def test_store()
    [
      ["COMPRESS", Field::Store::COMPRESS],
      ["YES", Field::Store::YES],
      ["NO", Field::Store::NO]
    ].each { |label, option| assert_equal(label, option.to_s) }
  end

  # Each Index constant stringifies to its own name.
  def test_index()
    [
      ["TOKENIZED", Field::Index::TOKENIZED],
      ["UNTOKENIZED", Field::Index::UNTOKENIZED],
      ["NO", Field::Index::NO]
    ].each { |label, option| assert_equal(label, option.to_s) }
  end

  # Each TermVector constant stringifies to its own name.
  def test_term_vector()
    [
      ["YES", Field::TermVector::YES],
      ["NO", Field::TermVector::NO],
      ["WITH_POSITIONS", Field::TermVector::WITH_POSITIONS],
      ["WITH_OFFSETS", Field::TermVector::WITH_OFFSETS],
      ["WITH_POSITIONS_OFFSETS", Field::TermVector::WITH_POSITIONS_OFFSETS]
    ].each { |label, option| assert_equal(label, option.to_s) }
  end

  # A compressed, tokenized field is stored, compressed, indexed and
  # tokenized, with no term-vector data and no binary flag.
  def test_standard_field()
    f = build_field
    assert_equal("name", f.name)
    assert_equal("value", f.data)
    verify_flags(f, [
      [:stored?, true],
      [:compressed?, true],
      [:indexed?, true],
      [:tokenized?, true],
      [:store_term_vector?, false],
      [:store_offsets?, false],
      [:store_positions?, false],
      [:binary?, false]
    ])
  end

  # Switching storage off clears both the stored and compressed flags.
  def test_set_store()
    f = build_field
    f.stored = Field::Store::NO
    verify_flags(f, [[:stored?, false], [:compressed?, false]])
  end

  # Switching indexing off clears both the indexed and tokenized flags.
  def test_set_index()
    f = build_field
    f.index = Field::Index::NO
    verify_flags(f, [[:indexed?, false], [:tokenized?, false]])
  end

  # WITH_POSITIONS_OFFSETS turns on the term vector plus both detail flags.
  def test_set_term_vector()
    f = build_field
    f.store_term_vector = Field::TermVector::WITH_POSITIONS_OFFSETS
    verify_flags(f, [
      [:store_term_vector?, true],
      [:store_offsets?, true],
      [:store_positions?, true]
    ])
  end

  # A binary field is stored and flagged binary; every other flag is false.
  def test_new_binary_field()
    bin = (0...256).to_a.pack("c*")
    f = Field.new_binary_field("name", bin, Field::Store::YES)
    assert_equal("name", f.name)
    assert_equal(bin, f.data)
    verify_flags(f, [
      [:stored?, true],
      [:compressed?, false],
      [:indexed?, false],
      [:tokenized?, false],
      [:store_term_vector?, false],
      [:store_offsets?, false],
      [:store_positions?, false],
      [:binary?, true]
    ])
  end

  private

  # Builds the compressed, tokenized "name"/"value" field most tests start from.
  def build_field
    Field.new("name", "value", Field::Store::COMPRESS, Field::Index::TOKENIZED)
  end

  # Asserts, in order, that each predicate in +expectations+ (an array of
  # [predicate_symbol, expected_boolean] pairs) returns the expected value.
  def verify_flags(field, expectations)
    expectations.each do |predicate, expected|
      assert_equal(expected, field.send(predicate))
    end
  end
end
@@ -0,0 +1,107 @@
1
+ require File.dirname(__FILE__) + "/../../test_helper"
2
+
3
+
4
# Tests CompoundFileWriter by packing two small files into one compound
# file and reading the raw result back field by field.
class CompoundFileWriterTest < Test::Unit::TestCase

  include Ferret::Index

  # Gives every test a fresh in-memory directory.
  def setup()
    @dir = Ferret::Store::RAMDirectory.new
  end

  # Test::Unit's per-test cleanup hook is named +teardown+; under the old
  # name +tear_down+ this method was never invoked and the directory was
  # never closed.
  def teardown()
    @dir.close()
  end

  # Writes file1 (one int) and file2 (one string), combines them, and then
  # checks the layout the reads below expect: a vint entry count, one
  # (long offset, string name) pair per file, then the file contents in
  # the order the files were added.
  def test_writer
    file1 = @dir.create_output("file1")
    file2 = @dir.create_output("file2")
    file1.write_int(20)
    file2.write_string('this is file2')
    file1.close()
    file2.close()
    cfile_writer = CompoundFileWriter.new(@dir, "cfile")
    cfile_writer.add_file("file1")
    cfile_writer.add_file("file2")
    cfile_writer.close()

    cfile = @dir.open_input("cfile")
    assert_equal(2, cfile.read_vint())
    # The second offset is 4 past the first; presumably that is the size of
    # file1's single int -- confirm against the IndexOutput encoding.
    assert_equal(29, cfile.read_long(), "Offset is incorrect")
    assert_equal("file1", cfile.read_string(), "Filename is incorrect")
    assert_equal(33, cfile.read_long(), "Offset is incorrect")
    assert_equal("file2", cfile.read_string(), "Filename is incorrect")
    assert_equal(20, cfile.read_int(), "Content is incorrect")
    assert_equal('this is file2', cfile.read_string(), "Content is incorrect")
  end
end
38
+
39
# Tests CompoundFileReader against a compound file that the test writes by
# hand, so the reader is checked independently of CompoundFileWriter.
class CompoundFileReaderTest < Test::Unit::TestCase

  include Ferret::Index

  # Gives every test a fresh in-memory directory.
  def setup()
    @dir = Ferret::Store::RAMDirectory.new
  end

  # Test::Unit's per-test cleanup hook is named +teardown+; under the old
  # name +tear_down+ this method was never invoked and the directory was
  # never closed.
  def teardown()
    @dir.close()
  end

  # Hand-writes a two-entry compound file (entry count, then an offset and
  # a name per entry, then the two payloads) and checks that the reader
  # reports each sub-file's length and returns its contents.
  def test_reader
    cfile = @dir.create_output("cfile")
    cfile.write_vint(2)
    cfile.write_long(29)
    cfile.write_string('file1')
    cfile.write_long(33)
    cfile.write_string('file2')
    cfile.write_int(20)
    cfile.write_string("this is file 2")
    cfile.close()

    cfile_reader = CompoundFileReader.new(@dir, "cfile")
    assert_equal(4, cfile_reader.file_length('file1'))
    assert_equal(15, cfile_reader.file_length('file2'))
    file1 = cfile_reader.open_input('file1')
    file2 = cfile_reader.open_input('file2')
    assert_equal(20, file1.read_int())
    assert_equal('this is file 2', file2.read_string())
    file1.close()
    file2.close()
  end
end
73
+
74
# Round-trip test: writes files with CompoundFileWriter and reads one of
# them back through CompoundFileReader.
class CompoundFileIOTest < Test::Unit::TestCase

  include Ferret::Index

  # Gives every test a fresh in-memory directory.
  def setup()
    @dir = Ferret::Store::RAMDirectory.new
  end

  # Test::Unit's per-test cleanup hook is named +teardown+; under the old
  # name +tear_down+ this method was never invoked and the directory was
  # never closed.
  def teardown()
    @dir.close()
  end

  # Packs three files into a compound file, the middle one holding a
  # 13,000-character string, and checks that the large string is read back
  # intact from between its two neighbours.
  def test_buffer
    file1 = @dir.create_output("file1")
    file2 = @dir.create_output("file2")
    file3 = @dir.create_output("file3")
    20.times { file1.write_int(rand(10000)) }
    file2.write_string('this is file2' * 1000)
    file3.write_string('this is file2')
    file1.close()
    file2.close()
    file3.close()
    cfile_writer = CompoundFileWriter.new(@dir, "cfile")
    cfile_writer.add_file("file1")
    cfile_writer.add_file("file2")
    cfile_writer.add_file("file3")
    cfile_writer.close()

    cfile_reader = CompoundFileReader.new(@dir, "cfile")
    file2 = cfile_reader.open_input('file2')
    assert_equal('this is file2' * 1000, file2.read_string)
    file2.close
  end
end
@@ -0,0 +1,119 @@
1
+ require File.dirname(__FILE__) + "/../../test_helper"
2
+
3
# Tests FieldInfo (one field's name, number and indexing flags) and
# FieldInfos (the collection that assigns field numbers, looks fields up by
# name or number, and is written to and re-read from a directory).
class FieldInfosTest < Test::Unit::TestCase
  include Ferret::Index

  # Checks FieldInfo's readers, its individual setters, the bulk set!
  # call, and that the two trailing constructor flags default to false.
  def test_field_info()
    fi = FieldInfo.new("name", true, 1, true, true, true)
    # assert_equal takes (expected, actual); the original calls had the two
    # swapped, which only shows up as a misleading failure message.
    assert_equal("name", fi.name)
    assert_equal(1, fi.number)
    assert(fi.indexed?)
    assert(fi.store_term_vector?)
    assert(fi.store_offsets?)
    assert(fi.store_positions?)

    fi.name = "hello"
    fi.indexed = false
    fi.number = 2
    fi.store_term_vector = false
    fi.store_offset = false
    fi.store_position = false

    assert_equal("hello", fi.name)
    assert_equal(2, fi.number)
    assert(!fi.indexed?)
    assert(!fi.store_term_vector?)
    assert(!fi.store_offsets?)
    assert(!fi.store_positions?)

    fi.set!(true, true, true, true)
    assert(fi.indexed?)
    assert(fi.store_term_vector?)
    assert(fi.store_offsets?)
    assert(fi.store_positions?)

    fi = FieldInfo.new("name", true, 1, true)
    assert(!fi.store_offsets?)
    assert(!fi.store_positions?)
  end

  # Assertion helper: checks every attribute of +fi+ against the expected
  # name, number, indexed, term-vector, positions and offsets values.
  def fi_test_attr(fi, name, number, indexed, store_tv, store_pos, store_off)
    assert_equal(name, fi.name)
    assert_equal(number, fi.number)
    assert_equal(indexed, fi.indexed?)
    assert_equal(store_tv, fi.store_term_vector?)
    assert_equal(store_pos, fi.store_positions?)
    assert_equal(store_off, fi.store_offsets?)
  end

  # Adding a name that already exists must update the existing entry (same
  # number, same collection size) rather than create a second one; unknown
  # names map to NOT_A_FIELD / nil.
  def test_fis_add()
    fis = FieldInfos.new()
    fi = fis.add("field1", false)
    fi_test_attr(fi, "field1", 0, false, false, false, false)
    assert_equal(1, fis.size)

    fi = fis.add("field1", true, true)
    fi_test_attr(fi, "field1", 0, true, true, false, false)
    assert_equal(1, fis.size)

    fi = fis.add("field2", false)
    fi_test_attr(fi, "field2", 1, false, false, false, false)
    assert_equal(2, fis.size)

    fi = fis.add("field1", true, true, true, true)
    assert_equal(fi, fis[fi.number])
    assert_equal(fi, fis["field1"])
    assert_equal(0, fis.field_number("field1"))
    assert_equal(1, fis.field_number("field2"))
    assert_equal(FieldInfos::NOT_A_FIELD, fis.field_number("field3"))
    assert_equal(nil, fis["field3"])
    fi_test_attr(fi, "field1", 0, true, true, true, true)
    assert_equal(2, fis.size)
  end

  # Registers every field of the shared test document, writes the
  # collection to a directory and checks the copy read back from it.
  def test_add_doc_fields
    doc = IndexTestHelper.prepare_document
    fis = FieldInfos.new()
    fis << doc
    dir = Ferret::Store::RAMDirectory.new
    fis.write_to_dir(dir, "_test")
    fis2 = FieldInfos.new(dir, "_test")
    assert_equal("text_field1", fis2["text_field1"].name)
    fn = fis2.field_number("text_field2")
    assert_equal("text_field2", fis2[fn].name)
    assert_equal(9, fis2.size)
    # NOTE(review): this checks the in-memory +fis+, not the reloaded
    # +fis2+ that the rest of the test inspects -- confirm which was meant.
    assert(fis.has_vectors?)
  end

  # has_vectors? stays false until a field that stores a term vector is added.
  def test_fis_has_vectors
    fis = FieldInfos.new()
    assert(! fis.has_vectors?)
    fis.add("random_field")
    assert(! fis.has_vectors?)
    fis.add("store_term_vector_field", true, true, false, false)
    assert(fis.has_vectors?)
  end


  # Writes five fields, each enabling one more flag than the last, then
  # re-reads them and checks that numbers and flags survive the round trip.
  def test_fis_rw()
    fis = FieldInfos.new()
    dir = Ferret::Store::RAMDirectory.new()
    fis.add("field1", false, false, false, false)
    fis.add("field2", true, false, false, false)
    fis.add("field3", true, true, false, false)
    fis.add("field4", true, true, true, false)
    fis.add("field5", true, true, true, true)
    fis.write_to_dir(dir, "fis_rw.test")
    # Drop the in-memory copy so the assertions below can only see what was
    # read back from the directory.
    fis = nil

    fis = FieldInfos.new(dir, "fis_rw.test")
    fi_test_attr(fis[0], "field1", 0, false, false, false, false)
    fi_test_attr(fis[1], "field2", 1, true, false, false, false)
    fi_test_attr(fis[2], "field3", 2, true, true, false, false)
    fi_test_attr(fis[3], "field4", 3, true, true, true, false)
    fi_test_attr(fis[4], "field5", 4, true, true, true, true)

    assert_equal(5, fis.size)
  end

end
@@ -0,0 +1,167 @@
1
+ require File.dirname(__FILE__) + "/../../test_helper"
2
+
3
# Tests FieldsWriter by writing one two-field document and then decoding
# the raw ".fdt" stream by hand.
class FieldsWriterTest < Test::Unit::TestCase

  include Ferret::Index
  include Ferret::Document

  # Gives every test a fresh in-memory directory.
  def setup()
    @dir = Ferret::Store::RAMDirectory.new
  end

  # Test::Unit's per-test cleanup hook is named +teardown+; under the old
  # name +tear_down+ this method was never invoked and the directory was
  # never closed.
  def teardown()
    @dir.close()
  end

  # Writes a document with two stored fields and checks the data stream:
  # a vint stored-field count, then for each field a vint field number, a
  # flag byte and the string value.
  def test_writer
    doc = Document.new
    doc << Field.new("name", "daily news", Field::Store::YES)
    doc << Field.new("content", "Nothing happened today.", Field::Store::YES)

    infos = FieldInfos.new
    infos << doc

    writer = FieldsWriter.new(@dir, "fieldswritertest", infos)
    writer << doc
    writer.close

    fstream = @dir.open_input("fieldswritertest.fdt")
    # Opened but never read; kept so the call still runs against the index file.
    istream = @dir.open_input("fieldswritertest.fdx")

    stored = fstream.read_vint
    # The field numbers are not asserted on, but they must still be read to
    # advance the stream to the flag byte.
    field_num1 = fstream.read_vint
    byte1 = fstream.read_byte
    data1 = fstream.read_string
    assert_equal(2, stored)
    # NOTE(review): `|=` ORs the flag into the byte, so this can never fail
    # as long as FIELD_IS_TOKENIZED is non-zero; `&` was probably intended.
    # Left unchanged because tightening it depends on the default index
    # mode of Field.new, which is not visible here -- confirm before fixing.
    assert( (byte1 |= FieldsWriter::FIELD_IS_TOKENIZED) != 0 )
    assert_equal("daily news", data1)

    field_num2 = fstream.read_vint
    byte2 = fstream.read_byte
    data2 = fstream.read_string
    # NOTE(review): same always-true `|=` check as above.
    assert( (byte2 |= FieldsWriter::FIELD_IS_TOKENIZED) != 0 )
    assert_equal("Nothing happened today.", data2)

  end
end
47
+
48
# Tests FieldsReader against ".fdt" / ".fdx" streams that the test writes
# by hand, so the reader is checked independently of FieldsWriter.
class FieldsReaderTest < Test::Unit::TestCase

  include Ferret::Index
  include Ferret::Document

  # Gives every test a fresh in-memory directory.
  def setup()
    @dir = Ferret::Store::RAMDirectory.new
  end

  # Test::Unit's per-test cleanup hook is named +teardown+; under the old
  # name +tear_down+ this method was never invoked and the directory was
  # never closed.
  def teardown()
    @dir.close()
  end

  # Hand-encodes one document with two fields and checks that the reader
  # returns both values under the right field names.
  def test_doc
    # This document is only used to register the two field names in +infos+.
    doc = Document.new
    doc << Field.new("name", "daily news")
    doc << Field.new("content", "Nothing happened today.")

    infos = FieldInfos.new
    infos << doc

    fstream = @dir.create_output("fieldsreadertest.fdt")
    istream = @dir.create_output("fieldsreadertest.fdx")

    # Index stream: a single long; presumably the position of document 0 in
    # the data stream -- confirm against FieldsReader.
    istream.write_long(0)
    istream.close
    # Data stream: field count, then (field number, flag byte, value) per field.
    fstream.write_vint(2)
    fstream.write_vint(0)
    fstream.write_byte(0)
    fstream.write_string("daily news")
    fstream.write_vint(1)
    fstream.write_byte(0)
    fstream.write_string("Nothing happened today.")
    fstream.close

    reader = FieldsReader.new(@dir, "fieldsreadertest", infos)
    docres = reader.doc(0)

    # assert_equal takes (expected, actual); the original calls had the two
    # swapped, which only shows up as a misleading failure message.
    assert_equal("daily news", docres.field("name").data)
    assert_equal("Nothing happened today.", docres.field("content").data)
  end
end
90
+
91
# Round-trip test: writes the shared IndexTestHelper document with
# FieldsWriter, reads it back with FieldsReader, and checks the value and
# flags of each kind of field.
class FieldsIOTest < Test::Unit::TestCase

  include Ferret::Index
  include Ferret::Document

  # Writes the helper document into a fresh in-memory directory and keeps
  # the copy read back from it in @docres for the individual tests.
  def setup()
    @dir = Ferret::Store::RAMDirectory.new
    doc = IndexTestHelper.prepare_document()
    infos = FieldInfos.new
    infos << doc

    writer = FieldsWriter.new(@dir, "field_types", infos)
    writer << doc
    writer.close

    reader = FieldsReader.new(@dir, "field_types", infos)
    @docres = reader.doc(0)
  end

  # Test::Unit's per-test cleanup hook is named +teardown+; under the old
  # name +tear_down+ this method was never invoked and the directory was
  # never closed.
  def teardown()
    @dir.close()
  end

  def test_text_field_no_term_vector
    field = @docres.field("text_field1")
    check_field_values(field, "field one text", true, true, true, false, false)
  end

  def test_text_field_term_vector
    field = @docres.field("text_field2")
    check_field_values(field, "field field field two text", true, true, true, true, false)
  end

  def test_key_field
    field = @docres.field("key_field")
    check_field_values(field, "keyword", true, true, false, false, false)
  end

  def test_unindexed_field
    field = @docres.field("unindexed_field")
    check_field_values(field, "unindexed field text", true, false, false, false, false)
  end

  # The test expects no field to come back for "unstored_field1".
  def test_unstored_field_no_term_vector
    field = @docres.field("unstored_field1")
    assert_equal(nil, field)
  end

  def test_compressed_field
    field = @docres.field("compressed_field")
    check_field_values(field, "compressed text", true, true, true, true, false)
  end

  def test_binary_field
    bin = IndexTestHelper::BINARY_DATA
    field = @docres.field("binary_field")
    check_field_values(field, bin, true, false, false, false, true)
  end

  def test_compressed_binary_field
    cbin = IndexTestHelper::COMPRESSED_BINARY_DATA
    field = @docres.field("compressed_binary_field")
    check_field_values(field, cbin, true, false, false, false, true)
  end


  private

  # Asserts the data and the stored / indexed / tokenized / term-vector /
  # binary flags of +field+ against the expected values, in that order.
  def check_field_values(field, value, stored, indexed, tokenized, term_vector, binary)
    assert_equal(value, field.data)
    assert_equal(stored, field.stored?)
    assert_equal(indexed, field.indexed?)
    assert_equal(tokenized, field.tokenized?)
    assert_equal(term_vector, field.store_term_vector?)
    assert_equal(binary, field.binary?)
  end
end