ferret 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202)
  1. data/MIT-LICENSE +20 -0
  2. data/README +109 -0
  3. data/Rakefile +275 -0
  4. data/TODO +9 -0
  5. data/TUTORIAL +197 -0
  6. data/ext/extconf.rb +3 -0
  7. data/ext/ferret.c +23 -0
  8. data/ext/ferret.h +85 -0
  9. data/ext/index_io.c +543 -0
  10. data/ext/priority_queue.c +227 -0
  11. data/ext/ram_directory.c +316 -0
  12. data/ext/segment_merge_queue.c +41 -0
  13. data/ext/string_helper.c +42 -0
  14. data/ext/tags +240 -0
  15. data/ext/term.c +261 -0
  16. data/ext/term_buffer.c +299 -0
  17. data/ext/util.c +12 -0
  18. data/lib/ferret.rb +41 -0
  19. data/lib/ferret/analysis.rb +11 -0
  20. data/lib/ferret/analysis/analyzers.rb +93 -0
  21. data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
  22. data/lib/ferret/analysis/token.rb +79 -0
  23. data/lib/ferret/analysis/token_filters.rb +86 -0
  24. data/lib/ferret/analysis/token_stream.rb +26 -0
  25. data/lib/ferret/analysis/tokenizers.rb +107 -0
  26. data/lib/ferret/analysis/word_list_loader.rb +27 -0
  27. data/lib/ferret/document.rb +2 -0
  28. data/lib/ferret/document/document.rb +152 -0
  29. data/lib/ferret/document/field.rb +304 -0
  30. data/lib/ferret/index.rb +26 -0
  31. data/lib/ferret/index/compound_file_io.rb +343 -0
  32. data/lib/ferret/index/document_writer.rb +288 -0
  33. data/lib/ferret/index/field_infos.rb +259 -0
  34. data/lib/ferret/index/fields_io.rb +175 -0
  35. data/lib/ferret/index/index.rb +228 -0
  36. data/lib/ferret/index/index_file_names.rb +33 -0
  37. data/lib/ferret/index/index_reader.rb +462 -0
  38. data/lib/ferret/index/index_writer.rb +488 -0
  39. data/lib/ferret/index/multi_reader.rb +363 -0
  40. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
  41. data/lib/ferret/index/segment_infos.rb +130 -0
  42. data/lib/ferret/index/segment_merge_info.rb +47 -0
  43. data/lib/ferret/index/segment_merge_queue.rb +16 -0
  44. data/lib/ferret/index/segment_merger.rb +337 -0
  45. data/lib/ferret/index/segment_reader.rb +380 -0
  46. data/lib/ferret/index/segment_term_enum.rb +178 -0
  47. data/lib/ferret/index/segment_term_vector.rb +58 -0
  48. data/lib/ferret/index/term.rb +49 -0
  49. data/lib/ferret/index/term_buffer.rb +88 -0
  50. data/lib/ferret/index/term_doc_enum.rb +283 -0
  51. data/lib/ferret/index/term_enum.rb +52 -0
  52. data/lib/ferret/index/term_info.rb +41 -0
  53. data/lib/ferret/index/term_infos_io.rb +312 -0
  54. data/lib/ferret/index/term_vector_offset_info.rb +20 -0
  55. data/lib/ferret/index/term_vectors_io.rb +552 -0
  56. data/lib/ferret/query_parser.rb +274 -0
  57. data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
  58. data/lib/ferret/search.rb +49 -0
  59. data/lib/ferret/search/boolean_clause.rb +100 -0
  60. data/lib/ferret/search/boolean_query.rb +303 -0
  61. data/lib/ferret/search/boolean_scorer.rb +294 -0
  62. data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
  63. data/lib/ferret/search/conjunction_scorer.rb +99 -0
  64. data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
  65. data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
  66. data/lib/ferret/search/explanation.rb +41 -0
  67. data/lib/ferret/search/field_cache.rb +216 -0
  68. data/lib/ferret/search/field_doc.rb +31 -0
  69. data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
  70. data/lib/ferret/search/filter.rb +11 -0
  71. data/lib/ferret/search/filtered_query.rb +130 -0
  72. data/lib/ferret/search/filtered_term_enum.rb +79 -0
  73. data/lib/ferret/search/fuzzy_query.rb +153 -0
  74. data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
  75. data/lib/ferret/search/hit_collector.rb +34 -0
  76. data/lib/ferret/search/hit_queue.rb +11 -0
  77. data/lib/ferret/search/index_searcher.rb +173 -0
  78. data/lib/ferret/search/match_all_docs_query.rb +104 -0
  79. data/lib/ferret/search/multi_phrase_query.rb +204 -0
  80. data/lib/ferret/search/multi_term_query.rb +65 -0
  81. data/lib/ferret/search/non_matching_scorer.rb +22 -0
  82. data/lib/ferret/search/phrase_positions.rb +55 -0
  83. data/lib/ferret/search/phrase_query.rb +217 -0
  84. data/lib/ferret/search/phrase_scorer.rb +153 -0
  85. data/lib/ferret/search/prefix_query.rb +47 -0
  86. data/lib/ferret/search/query.rb +111 -0
  87. data/lib/ferret/search/query_filter.rb +51 -0
  88. data/lib/ferret/search/range_filter.rb +103 -0
  89. data/lib/ferret/search/range_query.rb +139 -0
  90. data/lib/ferret/search/req_excl_scorer.rb +125 -0
  91. data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
  92. data/lib/ferret/search/score_doc.rb +38 -0
  93. data/lib/ferret/search/score_doc_comparator.rb +114 -0
  94. data/lib/ferret/search/scorer.rb +91 -0
  95. data/lib/ferret/search/similarity.rb +278 -0
  96. data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
  97. data/lib/ferret/search/sort.rb +105 -0
  98. data/lib/ferret/search/sort_comparator.rb +60 -0
  99. data/lib/ferret/search/sort_field.rb +87 -0
  100. data/lib/ferret/search/spans.rb +12 -0
  101. data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
  102. data/lib/ferret/search/spans/span_first_query.rb +79 -0
  103. data/lib/ferret/search/spans/span_near_query.rb +108 -0
  104. data/lib/ferret/search/spans/span_not_query.rb +130 -0
  105. data/lib/ferret/search/spans/span_or_query.rb +176 -0
  106. data/lib/ferret/search/spans/span_query.rb +25 -0
  107. data/lib/ferret/search/spans/span_scorer.rb +74 -0
  108. data/lib/ferret/search/spans/span_term_query.rb +105 -0
  109. data/lib/ferret/search/spans/span_weight.rb +84 -0
  110. data/lib/ferret/search/spans/spans_enum.rb +44 -0
  111. data/lib/ferret/search/term_query.rb +128 -0
  112. data/lib/ferret/search/term_scorer.rb +181 -0
  113. data/lib/ferret/search/top_docs.rb +24 -0
  114. data/lib/ferret/search/top_field_docs.rb +17 -0
  115. data/lib/ferret/search/weight.rb +54 -0
  116. data/lib/ferret/search/wildcard_query.rb +26 -0
  117. data/lib/ferret/search/wildcard_term_enum.rb +61 -0
  118. data/lib/ferret/stemmers.rb +1 -0
  119. data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
  120. data/lib/ferret/store.rb +5 -0
  121. data/lib/ferret/store/buffered_index_io.rb +191 -0
  122. data/lib/ferret/store/directory.rb +139 -0
  123. data/lib/ferret/store/fs_store.rb +338 -0
  124. data/lib/ferret/store/index_io.rb +259 -0
  125. data/lib/ferret/store/ram_store.rb +282 -0
  126. data/lib/ferret/utils.rb +7 -0
  127. data/lib/ferret/utils/bit_vector.rb +105 -0
  128. data/lib/ferret/utils/date_tools.rb +138 -0
  129. data/lib/ferret/utils/number_tools.rb +91 -0
  130. data/lib/ferret/utils/parameter.rb +41 -0
  131. data/lib/ferret/utils/priority_queue.rb +120 -0
  132. data/lib/ferret/utils/string_helper.rb +47 -0
  133. data/lib/ferret/utils/weak_key_hash.rb +51 -0
  134. data/rake_utils/code_statistics.rb +106 -0
  135. data/setup.rb +1551 -0
  136. data/test/benchmark/tb_ram_store.rb +76 -0
  137. data/test/benchmark/tb_rw_vint.rb +26 -0
  138. data/test/longrunning/tc_numbertools.rb +60 -0
  139. data/test/longrunning/tm_store.rb +19 -0
  140. data/test/test_all.rb +9 -0
  141. data/test/test_helper.rb +6 -0
  142. data/test/unit/analysis/tc_analyzer.rb +21 -0
  143. data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
  144. data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
  145. data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
  146. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
  147. data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
  148. data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
  149. data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
  150. data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
  151. data/test/unit/analysis/tc_stop_filter.rb +14 -0
  152. data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
  153. data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
  154. data/test/unit/analysis/tc_word_list_loader.rb +32 -0
  155. data/test/unit/document/tc_document.rb +47 -0
  156. data/test/unit/document/tc_field.rb +80 -0
  157. data/test/unit/index/tc_compound_file_io.rb +107 -0
  158. data/test/unit/index/tc_field_infos.rb +119 -0
  159. data/test/unit/index/tc_fields_io.rb +167 -0
  160. data/test/unit/index/tc_index.rb +140 -0
  161. data/test/unit/index/tc_index_reader.rb +622 -0
  162. data/test/unit/index/tc_index_writer.rb +57 -0
  163. data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
  164. data/test/unit/index/tc_segment_infos.rb +74 -0
  165. data/test/unit/index/tc_segment_term_docs.rb +17 -0
  166. data/test/unit/index/tc_segment_term_enum.rb +60 -0
  167. data/test/unit/index/tc_segment_term_vector.rb +71 -0
  168. data/test/unit/index/tc_term.rb +22 -0
  169. data/test/unit/index/tc_term_buffer.rb +57 -0
  170. data/test/unit/index/tc_term_info.rb +19 -0
  171. data/test/unit/index/tc_term_infos_io.rb +192 -0
  172. data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
  173. data/test/unit/index/tc_term_vectors_io.rb +108 -0
  174. data/test/unit/index/th_doc.rb +244 -0
  175. data/test/unit/query_parser/tc_query_parser.rb +84 -0
  176. data/test/unit/search/tc_filter.rb +113 -0
  177. data/test/unit/search/tc_fuzzy_query.rb +136 -0
  178. data/test/unit/search/tc_index_searcher.rb +188 -0
  179. data/test/unit/search/tc_search_and_sort.rb +98 -0
  180. data/test/unit/search/tc_similarity.rb +37 -0
  181. data/test/unit/search/tc_sort.rb +48 -0
  182. data/test/unit/search/tc_sort_field.rb +27 -0
  183. data/test/unit/search/tc_spans.rb +153 -0
  184. data/test/unit/store/tc_fs_store.rb +84 -0
  185. data/test/unit/store/tc_ram_store.rb +35 -0
  186. data/test/unit/store/tm_store.rb +180 -0
  187. data/test/unit/store/tm_store_lock.rb +68 -0
  188. data/test/unit/ts_analysis.rb +16 -0
  189. data/test/unit/ts_document.rb +4 -0
  190. data/test/unit/ts_index.rb +18 -0
  191. data/test/unit/ts_query_parser.rb +3 -0
  192. data/test/unit/ts_search.rb +10 -0
  193. data/test/unit/ts_store.rb +6 -0
  194. data/test/unit/ts_utils.rb +10 -0
  195. data/test/unit/utils/tc_bit_vector.rb +65 -0
  196. data/test/unit/utils/tc_date_tools.rb +50 -0
  197. data/test/unit/utils/tc_number_tools.rb +59 -0
  198. data/test/unit/utils/tc_parameter.rb +40 -0
  199. data/test/unit/utils/tc_priority_queue.rb +62 -0
  200. data/test/unit/utils/tc_string_helper.rb +21 -0
  201. data/test/unit/utils/tc_weak_key_hash.rb +25 -0
  202. metadata +251 -0
data/test/unit/index/tc_term_info.rb
@@ -0,0 +1,19 @@
+ require File.dirname(__FILE__) + "/../../test_helper"
+
+
+ class TermInfoTest < Test::Unit::TestCase
+   include Ferret::Index
+   def test_term()
+     ti1 = TermInfo.new(1, 2, 3, 1)
+     assert_equal(ti1.doc_freq, 1)
+     assert_equal(ti1.freq_pointer, 2)
+     assert_equal(ti1.prox_pointer, 3)
+     assert_equal(ti1.skip_offset, 1)
+     ti2 = ti1.copy_of()
+     assert(ti1 == ti2)
+     ti2 = TermInfo.new(10, 9, 8)
+     assert(ti1 != ti2)
+     ti2.set!(ti1)
+     assert(ti1 == ti2)
+   end
+ end
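
For reference, the four TermInfo constructor arguments line up with the accessors asserted in the test above. A minimal sketch of the value semantics being exercised; the file-format meanings in the comments are an assumption based on the Lucene index format Ferret ports, not something this diff states:

  # Hypothetical sketch, not part of the package diff.
  ti = Ferret::Index::TermInfo.new(1, 2, 3, 1)
  ti.doc_freq      # => 1, documents containing the term
  ti.freq_pointer  # => 2, offset into the frequency data (assumed meaning)
  ti.prox_pointer  # => 3, offset into the position data (assumed meaning)
  ti.skip_offset   # => 1
  copy = ti.copy_of()                            # independent copy, compares equal
  other = Ferret::Index::TermInfo.new(10, 9, 8)  # three-argument form, as in the test
  other.set!(ti)                                 # overwrite in place; other == ti now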
data/test/unit/index/tc_term_infos_io.rb
@@ -0,0 +1,192 @@
+ require File.dirname(__FILE__) + "/../../test_helper"
+
+
+ class TermInfosIOTest < Test::Unit::TestCase
+   include Ferret::Index
+
+   DICT = [ "duad", "dual", "dualism", "dualist", "duality", "dualize", "duan",
+     "duarchy", "dub", "dubber", "dubbin", "dubbing", "dubiety", "dubiosity",
+     "dubious", "dubiously", "dubiousness", "dubitate", "dubitation", "dubnium",
+     "dubonnet", "ducal", "ducat", "ducatoon", "duce", "duchess", "duchesse",
+     "duchy", "duck", "duckbill", "duckboard", "ducker", "duckie", "ducking",
+     "duckling", "duckpin", "duckshove", "duckshover", "ducktail", "duckwalk",
+     "duckweed", "ducky", "duct", "ductile", "ductileness", "ductility",
+     "ducting", "ductless", "ductule", "ductulus", "ductwork", "dud", "dudder",
+     "duddery", "duddie", "duddy", "dude", "dudeen", "dudgeon", "due",
+     "duecento", "duel", "dueler", "dueling", "duelist", "dueller", "duelling",
+     "duellist", "duello", "duende", "dueness", "duenna", "duennaship", "duet",
+     "duette", "duettino", "duettist", "duetto", "duff", "duffel", "duffer",
+     "duffle", "dufus", "dug", "dugong", "dugout", "duiker", "duit", "duke",
+     "dukedom", "dukeling", "dukery", "dukeship", "dulcamara", "dulcet",
+     "dulcian", "dulciana", "dulcification", "dulcify", "dulcimer", "dulcimore",
+     "dulcinea", "dulcitone", "dulcorate", "dule", "dulfer", "dulia", "dull",
+     "dullard", "dullness", "dullsville", "dully", "dulness", "dulocracy",
+     "dulosis", "dulse", "duly", "duma", "dumaist", "dumb", "dumbass",
+     "dumbbell", "dumbcane", "dumbfound", "dumbfounder", "dumbhead",
+     "dumbledore", "dumbly", "dumbness", "dumbo", "dumbstruck", "dumbwaiter",
+     "dumdum", "dumfound", "dummerer", "dummkopf", "dummy", "dumortierite",
+     "dump", "dumpbin", "dumpcart", "dumper", "dumpiness", "dumping",
+     "dumpling", "dumplings", "dumpsite", "dumpster", "dumpy", "dun", "dunam",
+     "dunce", "dunch", "dunder", "dunderhead", "dunderheadedness", "dunderpate",
+     "dune", "duneland", "dunfish", "dung", "dungaree", "dungeon", "dungeoner",
+     "dungheap", "dunghill", "dungy", "dunite", "duniwassal", "dunk", "dunker",
+     "dunlin", "dunnage", "dunnakin", "dunness", "dunnite", "dunnock", "dunny",
+     "dunt", "duo", "duodecillion", "duodecimal", "duodecimo", "duodenectomy",
+     "duodenum", "duolog", "duologue", "duomo", "duopoly", "duopsony",
+     "duotone", "dup", "dupability", "dupatta", "dupe", "duper", "dupery",
+     "dupion", "duple", "duplet", "duplex", "duplexer", "duplexity",
+     "duplicability", "duplicand", "duplicate", "duplication", "duplicator",
+     "duplicature", "duplicitousness", "duplicity", "dupondius", "duppy",
+     "dura", "durability", "durable", "durableness", "durably", "dural",
+     "duralumin", "duramen", "durance", "duration", "durative", "durbar",
+     "dure", "dures", "duress", "durgan", "durian", "durion", "durmast",
+     "durn", "durned", "duro", "duroc", "durometer", "durr", "durra", "durrie",
+     "durukuli", "durum", "durzi", "dusk", "duskiness", "dusky", "dust",
+     "dustbin", "dustcart", "dustcloth", "dustcover", "duster", "dustheap",
+     "dustiness", "dusting", "dustless", "dustman", "dustmop", "dustoff",
+     "dustpan", "dustpanful", "dustrag", "dustsheet", "dustup", "dusty",
+     "dutch", "dutchman", "duteous", "duteously", "duteousness", "dutiability",
+     "dutiable", "dutifulness", "duty", "duumvir", "duumvirate", "duvet",
+     "duvetine", "duvetyn", "duvetyne", "dux", "duyker"]
+
+   TEST_SEGMENT = "_test"
+
+   def setup()
+     @dir = Ferret::Store::RAMDirectory.new
+   end
+
+   def tear_down()
+     @dir.close()
+   end
+
+   def test_two_field_io
+     term_dumbly = Term.new("word", "dumbly")
+     term_dualize = Term.new("word", "dualize")
+     term_rev_dualize = Term.new("reverse", "ezilaud")
+
+     fis = FieldInfos.new
+     fis.add("word", true, true)
+     fis.add("reverse", true, true)
+     terms = []
+     term_infos = []
+     tiw = TermInfosWriter.new(@dir, TEST_SEGMENT+"G", fis, 128)
+
+     reverse_words = []
+     DICT.each { |word| reverse_words << word.reverse }
+     reverse_words.sort!
+     reverse_words.each_with_index do |word, i|
+       tiw.add(Term.new("reverse", word), TermInfo.new(1, i, i, 0))
+     end
+     DICT.each_with_index do |word, i|
+       tiw.add(Term.new("word", word), TermInfo.new(1, 1000 + i, 1000 + i, 0))
+     end
+
+     tiw.close()
+     tir = TermInfosReader.new(@dir, TEST_SEGMENT+"G", fis)
+     assert_equal(564, tir.size)
+     assert_equal(16, tir.skip_interval)
+     assert_equal(561, tir.get_terms_position(Term.new("word", "duvetyne")))
+     assert_equal(TermInfo.new(1, 1005, 1005, 0), tir.get_term_info(term_dualize))
+     assert_equal(TermInfo.new(1, 70, 70, 0), tir.get_term_info(term_rev_dualize))
+   end
+
+   def test_io
+     term_dumbly = Term.new("word", "dumbly")
+     term_dualize = Term.new("word", "dualize")
+
+     fis = FieldInfos.new
+     fis.add("word", true, true)
+     terms = []
+     term_infos = []
+     tiw = TermInfosWriter.new(@dir, TEST_SEGMENT, fis, 128)
+     DICT.each_with_index do |word, i|
+       terms << Term.new("word", word)
+       term_infos << TermInfo.new(1, i, i, 0)
+       tiw.add(terms[i], term_infos[i])
+     end
+     tiw.close()
+     tir = TermInfosReader.new(@dir, TEST_SEGMENT, fis)
+     assert_equal(282, tir.size)
+     assert_equal(16, tir.skip_interval)
+     assert_equal(281, tir.get_terms_position(Term.new("word", "duyker")))
+     assert_equal(279, tir.get_terms_position(Term.new("word", "duvetyne")))
+     assert_equal(254, tir.get_terms_position(Term.new("word", "dusting")))
+     assert_equal(255, tir.get_terms_position(Term.new("word", "dustless")))
+     assert_equal(256, tir.get_terms_position(Term.new("word", "dustman")))
+     assert_equal(257, tir.get_terms_position(Term.new("word", "dustmop")))
+     assert_equal(TermInfo.new(1, 5, 5, 0), tir.get_term_info(term_dualize))
+     assert_equal(term_dumbly, tir.get_term(127))
+     terms = tir.terms_from(term_dumbly)
+     assert_equal(term_dumbly, terms.term)
+     assert(terms.next?)
+     assert_equal(Term.new("word", "dumbness"), terms.term)
+     assert(terms.next?)
+     assert_equal(Term.new("word", "dumbo"), terms.term)
+   end
+
+   def test_small_writer
+     fis = FieldInfos.new
+     fis.add("author", true, true)
+     fis.add("title", true, true)
+     tiw = TermInfosWriter.new(@dir, TEST_SEGMENT, fis, 128)
+     terms = [ Term.new("author", "Martel"),
+               Term.new("title", "Life of Pi"),
+               Term.new("author", "Martin"),
+               Term.new("title", "Life on the edge") ].sort
+     term_infos = []
+     4.times {|i| term_infos << TermInfo.new(i,i,i,i)}
+     4.times {|i| tiw.add(terms[i], term_infos[i]) }
+     tiw.close()
+
+     tis_file = @dir.open_input(TEST_SEGMENT + ".tis")
+     tii_file = @dir.open_input(TEST_SEGMENT + ".tii")
+     assert_equal(TermInfosWriter::FORMAT, tis_file.read_int())
+     assert_equal(4, tis_file.read_long()) # term count
+     assert_equal(128, tis_file.read_int()) # @index_interval
+     assert_equal(16, tis_file.read_int()) # @skip_interval
+     assert_equal(0, tis_file.read_vint()) # string_equal length
+     assert_equal(6, tis_file.read_vint()) # rest of string length
+     tis_file.read_chars(author = "", 0, 6) # the difference string
+     assert_equal("Martel", author.to_s)
+     assert_equal(0, tis_file.read_vint()) # field number
+     assert_equal(0, tis_file.read_vint()) # doc_freq
+     assert_equal(0, tis_file.read_vlong()) # freq pointer difference
+     assert_equal(0, tis_file.read_vlong()) # prox pointer difference
+     assert_equal(4, tis_file.read_vint()) # string_equal length
+     assert_equal(2, tis_file.read_vint()) # rest of string length
+     tis_file.read_chars(author = "", 0, 2) # the difference string
+     assert_equal("in", author.to_s)
+     assert_equal(0, tis_file.read_vint()) # field number
+     assert_equal(1, tis_file.read_vint()) # doc_freq
+     assert_equal(1, tis_file.read_vlong()) # freq pointer difference
+     assert_equal(1, tis_file.read_vlong()) # prox pointer difference
+     assert_equal(0, tis_file.read_vint()) # string_equal length
+     assert_equal(10, tis_file.read_vint()) # rest of string length
+     tis_file.read_chars(title = "", 0, 10) # the difference string
+     assert_equal("Life of Pi", title.to_s)
+     assert_equal(1, tis_file.read_vint()) # field number
+     assert_equal(2, tis_file.read_vint()) # doc_freq
+     assert_equal(1, tis_file.read_vlong()) # freq pointer difference
+     assert_equal(1, tis_file.read_vlong()) # prox pointer difference
+     assert_equal(6, tis_file.read_vint()) # string_equal length
+     assert_equal(10, tis_file.read_vint()) # rest of string length
+     tis_file.read_chars(title = "", 0, 10) # the difference string
+     assert_equal("n the edge", title.to_s)
+     assert_equal(1, tis_file.read_vint()) # field number
+     assert_equal(3, tis_file.read_vint()) # doc_freq
+     assert_equal(1, tis_file.read_vlong()) # freq pointer difference
+     assert_equal(1, tis_file.read_vlong()) # prox pointer difference
+
+     assert_equal(TermInfosWriter::FORMAT, tii_file.read_int())
+     assert_equal(1, tii_file.read_long())
+     assert_equal(128, tii_file.read_int())
+     assert_equal(16, tii_file.read_int())
+     assert_equal(0, tii_file.read_vint()) # string_equal length
+     assert_equal(0, tii_file.read_vint()) # rest of string length
+     assert_equal(0xFFFFFFFF, tii_file.read_vint()) # field number
+     assert_equal(0, tii_file.read_vint()) # doc_freq
+     assert_equal(0, tii_file.read_vlong()) # freq pointer difference
+     assert_equal(0, tii_file.read_vlong()) # prox pointer difference
+     assert_equal(20, tii_file.read_vlong()) # pointer to first element in other
+   end
+ end
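
Taken together, these tests exercise the term-dictionary round trip: TermInfosWriter writes the .tis/.tii files for a segment, and TermInfosReader looks terms back up. A minimal sketch of that round trip, using only the 0.1.0 API calls shown above (the segment name and terms here are illustrative; note that the writer expects terms in sorted order, which is why test_small_writer sorts its array):

  # Hypothetical sketch, not part of the package diff.
  require 'ferret'
  include Ferret::Index

  dir = Ferret::Store::RAMDirectory.new
  fis = FieldInfos.new
  fis.add("word", true, true)

  # One TermInfo(doc_freq, freq_pointer, prox_pointer, skip_offset)
  # per term, added in sorted order.
  tiw = TermInfosWriter.new(dir, "_seg", fis, 128)
  ["alpha", "beta", "gamma"].each_with_index do |w, i|
    tiw.add(Term.new("word", w), TermInfo.new(1, i, i, 0))
  end
  tiw.close()

  tir = TermInfosReader.new(dir, "_seg", fis)
  tir.size                                     # => 3
  tir.get_term_info(Term.new("word", "beta"))  # => TermInfo.new(1, 1, 1, 0)
  dir.close()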
data/test/unit/index/tc_term_vector_offset_info.rb
@@ -0,0 +1,18 @@
+ require File.dirname(__FILE__) + "/../../test_helper"
+
+
+ class TermVectorOffsetInfoTest < Test::Unit::TestCase
+   include Ferret::Index
+   def test_tvoi()
+     t1 = TermVectorOffsetInfo.new(1, 3)
+     assert_equal(t1.start_offset, 1)
+     assert_equal(t1.end_offset, 3)
+     t2 = TermVectorOffsetInfo.new(1, 3)
+     assert(t1 == t2)
+     t2.start_offset = 2
+     assert(t1 != t2)
+     t2.start_offset = 1
+     t2.end_offset = 1
+     assert(t1 != t2)
+   end
+ end
data/test/unit/index/tc_term_vectors_io.rb
@@ -0,0 +1,108 @@
+ require File.dirname(__FILE__) + "/../../test_helper"
+
+
+ class TermVectorsIOTest < Test::Unit::TestCase
+
+   include Ferret::Index
+
+   def setup()
+     @dir = Ferret::Store::RAMDirectory.new
+     @fis = FieldInfos.new
+     @fis.add("field1", true, true, true, true)
+     @fis.add("field2", true, true)
+   end
+
+   def tear_down()
+     @dir.close()
+   end
+
+   def test_tv_io_add_fields()
+     tv_w = TermVectorsWriter.new(@dir, "_test", @fis)
+     tv_w.open_document
+     assert(tv_w.document_open?)
+     tv_w.open_field("field1")
+     tv_w.add_term("text1", 1, [1], [t(0,4)])
+     tv_w.add_term("text2", 2, [3,4], [t(5,10), t(11,16)])
+     tv_w.close_field()
+     tv_w.close_document()
+     tv_w.close()
+
+     tv_r = TermVectorsReader.new(@dir, "_test", @fis)
+     assert_equal(1, tv_r.size)
+     tv = tv_r.get_field_tv(0, "field1")
+
+     assert_equal(2, tv.size)
+     assert_equal("text1", tv.terms[0])
+     assert_equal(1, tv.term_frequencies[0])
+     assert_equal(1, tv.positions[0][0])
+     assert_equal(t(0,4), tv.offsets[0][0])
+
+     assert_equal("text2", tv.terms[1])
+     assert_equal(2, tv.term_frequencies[1])
+     assert_equal(3, tv.positions[1][0])
+     assert_equal(t(5,10), tv.offsets[1][0])
+     assert_equal(4, tv.positions[1][1])
+     assert_equal(t(11,16), tv.offsets[1][1])
+     tv_r.close
+   end
+
+   def test_tv_io_add_documents()
+     tvs1 = []
+     tvs2 = []
+     tv = SegmentTermVector.new("field1",
+                                ["word1", "word2"],
+                                [3, 2],
+                                [[1, 5, 8], [2, 9]],
+                                [[t(0,5), t(34,39), t(45,50)],[t(6,11), t(51,56)]])
+     tvs1 << tv
+     tv = SegmentTermVector.new("field2",
+                                ["word3", "word4"],
+                                [1, 5],
+                                [[8], [2, 9, 11, 34, 56]],
+                                [[t(45,50)], [t(6,10), t(51,56), t(64,69), t(103,108), t(183,188)]])
+     tvs1 << tv
+     tv_w = TermVectorsWriter.new(@dir, "_test", @fis)
+     tv = SegmentTermVector.new("field1",
+                                ["word1", "word2"],
+                                [3, 2],
+                                [[1, 5, 8], [2, 9]],
+                                [[t(0,5), t(34,39), t(45,50)],[t(6,11), t(51,56)]])
+     tvs2 << tv
+     tv_w.add_all_doc_vectors(tvs1)
+     tv_w.add_all_doc_vectors(tvs2)
+     tv_w.close
+     tv_r = TermVectorsReader.new(@dir, "_test", @fis)
+     assert_equal(2, tv_r.size)
+     tv = tv_r.get_field_tv(0, "field1")
+
+     assert_equal(2, tv.size)
+     assert_equal("word1", tv.terms[0])
+     assert_equal(3, tv.term_frequencies[0])
+     assert_equal(1, tv.positions[0][0])
+     assert_equal(5, tv.positions[0][1])
+     assert_equal(8, tv.positions[0][2])
+     assert_equal(t(0,5), tv.offsets[0][0])
+     assert_equal(t(34,39), tv.offsets[0][1])
+     assert_equal(t(45,50), tv.offsets[0][2])
+
+     assert_equal("word2", tv.terms[1])
+     assert_equal(2, tv.term_frequencies[1])
+     assert_equal(2, tv.positions[1][0])
+     assert_equal(9, tv.positions[1][1])
+     assert_equal(t(6,11), tv.offsets[1][0])
+     assert_equal(t(51,56), tv.offsets[1][1])
+
+     tv = tv_r.get_field_tv(0, "field2")
+     assert_equal(2, tv.size)
+     assert_equal("word3", tv.terms[0])
+
+     tv = tv_r.get_field_tv(1, "field1")
+     assert_equal(2, tv.size)
+     assert_equal("word1", tv.terms[0])
+   end
+
+   private
+   def t(start, finish)
+     return TermVectorOffsetInfo.new(start, finish)
+   end
+ end
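
The writer side follows an open/close protocol per document and per field, and the reader addresses vectors by document number and field name. A minimal round trip using only the calls exercised above (segment and term names are illustrative):

  # Hypothetical sketch, not part of the package diff.
  require 'ferret'
  include Ferret::Index

  dir = Ferret::Store::RAMDirectory.new
  fis = FieldInfos.new
  fis.add("field1", true, true, true, true)  # store positions and offsets

  tv_w = TermVectorsWriter.new(dir, "_seg", fis)
  tv_w.open_document
  tv_w.open_field("field1")
  # term text, frequency, positions, offsets
  tv_w.add_term("hello", 1, [0], [TermVectorOffsetInfo.new(0, 5)])
  tv_w.close_field()
  tv_w.close_document()
  tv_w.close()

  tv_r = TermVectorsReader.new(dir, "_seg", fis)
  tv = tv_r.get_field_tv(0, "field1")  # term vector for doc 0
  tv.terms[0]                          # => "hello"
  tv_r.close
  dir.close()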
data/test/unit/index/th_doc.rb
@@ -0,0 +1,244 @@
+ require File.dirname(__FILE__) + "/../../test_helper"
+
+
+ module IndexTestHelper
+   include Ferret::Document
+   include Ferret::Index
+   include Ferret::Analysis
+   include Ferret::Search
+
+   def IndexTestHelper.make_binary(size)
+     tmp = Array.new(size)
+     size.times {|i| tmp[i] = i%256 }
+     return tmp.pack("c*")
+   end
+
+   BINARY_DATA = IndexTestHelper.make_binary(256)
+   COMPRESSED_BINARY_DATA = IndexTestHelper.make_binary(56)
+
+   def IndexTestHelper.prepare_document
+     doc = Document.new()
+
+     doc << Field.new("text_field1", "field one text", Field::Store::YES, Field::Index::TOKENIZED, Field::TermVector::NO)
+     doc << Field.new("text_field2", "field field field two text", Field::Store::YES, Field::Index::TOKENIZED, Field::TermVector::WITH_POSITIONS_OFFSETS)
+     doc << Field.new("key_field", "keyword", Field::Store::YES, Field::Index::UNTOKENIZED)
+     doc << Field.new("unindexed_field", "unindexed field text", Field::Store::YES, Field::Index::NO)
+     doc << Field.new("unstored_field1", "unstored field text one", Field::Store::NO, Field::Index::TOKENIZED, Field::TermVector::NO)
+     doc << Field.new("unstored_field2", "unstored field text two", Field::Store::NO, Field::Index::TOKENIZED, Field::TermVector::YES)
+     doc << Field.new("compressed_field", "compressed text", Field::Store::COMPRESS, Field::Index::TOKENIZED, Field::TermVector::YES)
+     doc << Field.new_binary_field("binary_field", BINARY_DATA, Field::Store::YES)
+     doc << Field.new_binary_field("compressed_binary_field", COMPRESSED_BINARY_DATA, Field::Store::COMPRESS)
+     return doc
+   end
+
+   def IndexTestHelper.prepare_documents
+     data = [
+       ["apple", "green"],
+       ["apple", "red"],
+       ["orange", "orange"],
+       ["grape", "green"],
+       ["grape", "purple"],
+       ["mandarin", "orange"],
+       ["peach", "orange"],
+       ["apricot", "orange"]
+     ]
+
+     docs = []
+
+     data.each do |food|
+       doc = Document.new()
+       doc << Field.new("name", food[0], Field::Store::YES, Field::Index::TOKENIZED, Field::TermVector::WITH_POSITIONS_OFFSETS)
+       doc << Field.new("colour", food[1], Field::Store::YES, Field::Index::TOKENIZED, Field::TermVector::WITH_POSITIONS_OFFSETS)
+       docs << doc
+     end
+     return docs
+   end
+
+   def IndexTestHelper.write_document(dir, doc, segment="test", analyzer = WhiteSpaceAnalyzer.new(), similarity = Similarity.default())
+     writer = DocumentWriter.new(dir, analyzer, similarity, 50)
+     writer.add_document(segment, doc)
+   end
+
+   def IndexTestHelper.prepare_book_list
+     books = [
+       {"author" => "P.H. Newby", "title" => "Something To Answer For", "year" => "1969"},
+       {"author" => "Bernice Rubens", "title" => "The Elected Member", "year" => "1970"},
+       {"author" => "V. S. Naipaul", "title" => "In a Free State", "year" => "1971"},
+       {"author" => "John Berger", "title" => "G", "year" => "1972"},
+       {"author" => "J. G. Farrell", "title" => "The Siege of Krishnapur", "year" => "1973"},
+       {"author" => "Stanley Middleton", "title" => "Holiday", "year" => "1974"},
+       {"author" => "Nadine Gordimer", "title" => "The Conservationist", "year" => "1974"},
+       {"author" => "Ruth Prawer Jhabvala", "title" => "Heat and Dust", "year" => "1975"},
+       {"author" => "David Storey", "title" => "Saville", "year" => "1976"},
+       {"author" => "Paul Scott", "title" => "Staying On", "year" => "1977"},
+       {"author" => "Iris Murdoch", "title" => "The Sea", "year" => "1978"},
+       {"author" => "Penelope Fitzgerald", "title" => "Offshore", "year" => "1979"},
+       {"author" => "William Golding", "title" => "Rites of Passage", "year" => "1980"},
+       {"author" => "Salman Rushdie", "title" => "Midnight's Children", "year" => "1981"},
+       {"author" => "Thomas Keneally", "title" => "Schindler's Ark", "year" => "1982"},
+       {"author" => "J. M. Coetzee", "title" => "Life and Times of Michael K", "year" => "1983"},
+       {"author" => "Anita Brookner", "title" => "Hotel du Lac", "year" => "1984"},
+       {"author" => "Keri Hulme", "title" => "The Bone People", "year" => "1985"},
+       {"author" => "Kingsley Amis", "title" => "The Old Devils", "year" => "1986"},
+       {"author" => "Penelope Lively", "title" => "Moon Tiger", "year" => "1987"},
+       {"author" => "Peter Carey", "title" => "Oscar and Lucinda", "year" => "1988"},
+       {"author" => "Kazuo Ishiguro", "title" => "The Remains of the Day", "year" => "1989"},
+       {"author" => "A. S. Byatt", "title" => "Possession", "year" => "1990"},
+       {"author" => "Ben Okri", "title" => "The Famished Road", "year" => "1991"},
+       {"author" => "Michael Ondaatje", "title" => "The English Patient", "year" => "1992"},
+       {"author" => "Barry Unsworth", "title" => "Sacred Hunger", "year" => "1992"},
+       {"author" => "Roddy Doyle", "title" => "Paddy Clarke Ha Ha Ha", "year" => "1993"},
+       {"author" => "James Kelman", "title" => "How Late It Was, How Late", "year" => "1994"},
+       {"author" => "Pat Barker", "title" => "The Ghost Road", "year" => "1995"},
+       {"author" => "Graham Swift", "title" => "Last Orders", "year" => "1996"},
+       {"author" => "Arundati Roy", "title" => "The God of Small Things", "year" => "1997"},
+       {"author" => "Ian McEwan", "title" => "Amsterdam", "year" => "1998"},
+       {"author" => "J. M. Coetzee", "title" => "Disgrace", "year" => "1999"},
+       {"author" => "Margaret Atwood", "title" => "The Blind Assassin", "year" => "2000"},
+       {"author" => "Peter Carey", "title" => "True History of the Kelly Gang", "year" => "2001"},
+       {"author" => "Yann Martel", "title" => "The Life of Pi", "year" => "2002"},
+       {"author" => "DBC Pierre", "title" => "Vernon God Little", "year" => "2003"}
+     ]
+     docs = []
+
+     books.each do |book|
+       doc = Document.new()
+       doc << Field.new("author", book["author"], Field::Store::YES, Field::Index::TOKENIZED, Field::TermVector::WITH_POSITIONS_OFFSETS)
+       doc << Field.new("title", book["title"], Field::Store::YES, Field::Index::TOKENIZED, Field::TermVector::WITH_POSITIONS_OFFSETS)
+       doc << Field.new("year", book["year"], Field::Store::YES, Field::Index::NO, Field::TermVector::NO)
+       docs << doc
+     end
+     return docs
+   end
+
+   IR_TEST_DOC_CNT = 64
+
+   def IndexTestHelper.prepare_ir_test_docs()
+     body = "body"
+     title = "title"
+     author = "author"
+     text = "text"
+     year = "year"
+     changing_field = "changing_field"
+
+     docs = Array.new(IR_TEST_DOC_CNT)
+     docs[0] = Document.new()
+     docs[0] << Field.new(body, "Where is Wally", Field::Store::YES, Field::Index::TOKENIZED, Field::TermVector::WITH_POSITIONS_OFFSETS)
+     docs[0] << Field.new(changing_field, "word3 word4 word1 word2 word1 word3 word4 word1 word3 word3", Field::Store::YES, Field::Index::TOKENIZED, Field::TermVector::NO)
+     docs[1] = Document.new()
+     docs[1] << Field.new(body, "Some Random Sentence read", Field::Store::YES, Field::Index::TOKENIZED, Field::TermVector::WITH_POSITIONS_OFFSETS)
+     docs[2] = Document.new()
+     docs[2] << Field.new(body, "Some read Random Sentence read", Field::Store::YES, Field::Index::TOKENIZED, Field::TermVector::WITH_POSITIONS_OFFSETS)
+     docs[3] = Document.new()
+     docs[3] << Field.new(title, "War And Peace", Field::Store::YES, Field::Index::UNTOKENIZED, Field::TermVector::WITH_OFFSETS)
+     docs[3] << Field.new(body, "word3 word4 word1 word2 word1 word3 word4 word1 word3 word3", Field::Store::YES, Field::Index::TOKENIZED, Field::TermVector::WITH_POSITIONS_OFFSETS)
+     docs[3] << Field.new(author, "Leo Tolstoy", Field::Store::YES, Field::Index::TOKENIZED, Field::TermVector::WITH_POSITIONS)
+     docs[3] << Field.new(year, "1865", Field::Store::YES, Field::Index::NO, Field::TermVector::NO)
+     docs[3] << Field.new(text, "more text which is not stored", Field::Store::NO, Field::Index::TOKENIZED, Field::TermVector::NO)
+     docs[4] = Document.new()
+     docs[4] << Field.new(body, "Some Random Sentence", Field::Store::YES, Field::Index::TOKENIZED, Field::TermVector::WITH_POSITIONS_OFFSETS)
+     docs[5] = Document.new()
+     docs[5] << Field.new(body, "Here's Wally", Field::Store::YES, Field::Index::TOKENIZED, Field::TermVector::WITH_POSITIONS_OFFSETS)
+     docs[6] = Document.new()
+     docs[6] << Field.new(body, "Some Random Sentence read read read read", Field::Store::YES, Field::Index::TOKENIZED, Field::TermVector::WITH_POSITIONS_OFFSETS)
+     docs[7] = Document.new()
+     docs[7] << Field.new(body, "Some Random Sentence", Field::Store::YES, Field::Index::TOKENIZED, Field::TermVector::WITH_POSITIONS_OFFSETS)
+     docs[8] = Document.new()
+     docs[8] << Field.new(body, "Some Random Sentence", Field::Store::YES, Field::Index::TOKENIZED, Field::TermVector::WITH_POSITIONS_OFFSETS)
+     docs[9] = Document.new()
+     docs[9] << Field.new(body, "read Some Random Sentence read this will be used after unfinished next position read", Field::Store::YES, Field::Index::TOKENIZED, Field::TermVector::WITH_POSITIONS_OFFSETS)
+     docs[10] = Document.new()
+     docs[10] << Field.new(body, "Some read Random Sentence", Field::Store::YES, Field::Index::TOKENIZED, Field::TermVector::WITH_POSITIONS_OFFSETS)
+     docs[10] << Field.new(changing_field, "word3 word4 word1 word2 word1 word3 word4 word1 word3 word3", Field::Store::YES, Field::Index::TOKENIZED, Field::TermVector::YES)
+     docs[11] = Document.new()
+     docs[11] << Field.new(body, "And here too. Well, maybe Not", Field::Store::YES, Field::Index::TOKENIZED, Field::TermVector::WITH_POSITIONS_OFFSETS)
+     docs[12] = Document.new()
+     docs[12] << Field.new(body, "Some Random Sentence", Field::Store::YES, Field::Index::TOKENIZED, Field::TermVector::WITH_POSITIONS_OFFSETS)
+     docs[13] = Document.new()
+     docs[13] << Field.new(body, "Some Random Sentence", Field::Store::YES, Field::Index::TOKENIZED, Field::TermVector::WITH_POSITIONS_OFFSETS)
+     docs[14] = Document.new()
+     docs[14] << Field.new(body, "Some Random Sentence", Field::Store::YES, Field::Index::TOKENIZED, Field::TermVector::WITH_POSITIONS_OFFSETS)
+     docs[15] = Document.new()
+     docs[15] << Field.new(body, "Some read Random Sentence", Field::Store::YES, Field::Index::TOKENIZED, Field::TermVector::WITH_POSITIONS_OFFSETS)
+     docs[16] = Document.new()
+     docs[16] << Field.new(body, "Some Random read read Sentence", Field::Store::YES, Field::Index::TOKENIZED, Field::TermVector::WITH_POSITIONS_OFFSETS)
+     docs[17] = Document.new()
+     docs[17] << Field.new(body, "Some Random read Sentence", Field::Store::YES, Field::Index::TOKENIZED, Field::TermVector::WITH_POSITIONS_OFFSETS)
+     docs[17] << Field.new(changing_field, "word3 word4 word1 word2 word1 word3 word4 word1 word3 word3", Field::Store::YES, Field::Index::TOKENIZED, Field::TermVector::WITH_POSITIONS)
+     docs[18] = Document.new()
+     docs[18] << Field.new(body, "Wally Wally Wally", Field::Store::YES, Field::Index::TOKENIZED, Field::TermVector::WITH_POSITIONS_OFFSETS)
+     docs[19] = Document.new()
+     docs[19] << Field.new(body, "Some Random Sentence", Field::Store::YES, Field::Index::TOKENIZED, Field::TermVector::WITH_POSITIONS_OFFSETS)
+     docs[19] << Field.new(changing_field, "word3 word4 word1 word2 word1 word3 word4 word1 word3 word3", Field::Store::YES, Field::Index::TOKENIZED, Field::TermVector::WITH_OFFSETS)
+     docs[20] = Document.new()
+     docs[20] << Field.new(body, "Wally is where Wally usually likes to go. Wally Mart! Wally likes shopping there for Where's Wally books. Wally likes to read", Field::Store::YES, Field::Index::TOKENIZED, Field::TermVector::WITH_POSITIONS_OFFSETS)
+     docs[20] << Field.new(changing_field, "word3 word4 word1 word2 word1 word3 word4 word1 word3 word3", Field::Store::YES, Field::Index::TOKENIZED, Field::TermVector::WITH_POSITIONS_OFFSETS)
+     docs[21] = Document.new()
+     docs[21] << Field.new(body, "Some Random Sentence read read read and more read read read", Field::Store::YES, Field::Index::TOKENIZED, Field::TermVector::WITH_POSITIONS_OFFSETS)
+     docs[21] << Field.new(changing_field, "word3 word4 word1 word2 word1 word3 word4 word1 word3 word3", Field::Store::YES, Field::Index::TOKENIZED, Field::TermVector::NO)
+
+     buf = ""
+     21.times { buf << "skip " }
+     22.upto(IR_TEST_DOC_CNT) do |i|
+       buf << "skip "
+       docs[i] = Document.new()
+       docs[i] << Field.new(text, buf.clone, Field::Store::NO, Field::Index::TOKENIZED, Field::TermVector::WITH_POSITIONS_OFFSETS)
+     end
+     return docs
+   end
+
+   def IndexTestHelper.prepare_search_docs
+     data = [
+       {"date" => "20050930", "field" => "word1",
+        "cat" => "cat1/"},
+       {"date" => "20051001", "field" => "word1 word2 the quick brown fox",
+        "cat" => "cat1/sub1"},
+       {"date" => "20051002", "field" => "word1 word3",
+        "cat" => "cat1/sub1/subsub1"},
+       {"date" => "20051003", "field" => "word1 word3",
+        "cat" => "cat1/sub2"},
+       {"date" => "20051004", "field" => "word1 word2",
+        "cat" => "cat1/sub2/subsub2"},
+       {"date" => "20051005", "field" => "word1",
+        "cat" => "cat2/sub1"},
+       {"date" => "20051006", "field" => "word1 word3",
+        "cat" => "cat2/sub1"},
+       {"date" => "20051007", "field" => "word1",
+        "cat" => "cat2/sub1"},
+       {"date" => "20051008", "field" => "word1 word2 word3 the fast brown fox",
+        "cat" => "cat2/sub1"},
+       {"date" => "20051009", "field" => "word1",
+        "cat" => "cat3/sub1"},
+       {"date" => "20051010", "field" => "word1",
+        "cat" => "cat3/sub1"},
+       {"date" => "20051011", "field" => "word1 word3 the quick red fox",
+        "cat" => "cat3/sub1"},
+       {"date" => "20051012", "field" => "word1",
+        "cat" => "cat3/sub1"},
+       {"date" => "20051013", "field" => "word1",
+        "cat" => "cat1/sub2"},
+       {"date" => "20051014", "field" => "word1 word3 the quick hairy fox",
+        "cat" => "cat1/sub1"},
+       {"date" => "20051015", "field" => "word1",
+        "cat" => "cat1/sub2/subsub1"},
+       {"date" => "20051016",
+        "field" => "word1 the quick fox is brown and hairy and a little red",
+        "cat" => "cat1/sub1/subsub2"},
+       {"date" => "20051017", "field" => "word1 the brown fox is quick and red",
+        "cat" => "cat1/"}
+     ]
+
+     docs = []
+     data.each_with_index do |fields, i|
+       doc = Document.new()
+       doc.boost = i+1
+
+       fields.each_pair do |field, text|
+         doc << Field.new(field, text, Field::Store::NO, Field::Index::TOKENIZED, Field::TermVector::NO, i+1)
+       end
+       docs << doc
+     end
+     return docs
+   end
+ end
+
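
For orientation, a sketch of how a test case might consume this helper module, using only the helper methods defined above (the segment names are illustrative):

  # Hypothetical sketch, not part of the package diff.
  dir = Ferret::Store::RAMDirectory.new

  # Write each prepared fruit document into its own segment via DocumentWriter.
  IndexTestHelper.prepare_documents.each_with_index do |doc, i|
    IndexTestHelper.write_document(dir, doc, "_seg#{i}")
  end

  book_docs = IndexTestHelper.prepare_book_list    # 37 Booker-winner documents
  ir_docs   = IndexTestHelper.prepare_ir_test_docs # documents for the IndexReader tests
  dir.close()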