ferret 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (202) hide show
  1. data/MIT-LICENSE +20 -0
  2. data/README +109 -0
  3. data/Rakefile +275 -0
  4. data/TODO +9 -0
  5. data/TUTORIAL +197 -0
  6. data/ext/extconf.rb +3 -0
  7. data/ext/ferret.c +23 -0
  8. data/ext/ferret.h +85 -0
  9. data/ext/index_io.c +543 -0
  10. data/ext/priority_queue.c +227 -0
  11. data/ext/ram_directory.c +316 -0
  12. data/ext/segment_merge_queue.c +41 -0
  13. data/ext/string_helper.c +42 -0
  14. data/ext/tags +240 -0
  15. data/ext/term.c +261 -0
  16. data/ext/term_buffer.c +299 -0
  17. data/ext/util.c +12 -0
  18. data/lib/ferret.rb +41 -0
  19. data/lib/ferret/analysis.rb +11 -0
  20. data/lib/ferret/analysis/analyzers.rb +93 -0
  21. data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
  22. data/lib/ferret/analysis/token.rb +79 -0
  23. data/lib/ferret/analysis/token_filters.rb +86 -0
  24. data/lib/ferret/analysis/token_stream.rb +26 -0
  25. data/lib/ferret/analysis/tokenizers.rb +107 -0
  26. data/lib/ferret/analysis/word_list_loader.rb +27 -0
  27. data/lib/ferret/document.rb +2 -0
  28. data/lib/ferret/document/document.rb +152 -0
  29. data/lib/ferret/document/field.rb +304 -0
  30. data/lib/ferret/index.rb +26 -0
  31. data/lib/ferret/index/compound_file_io.rb +343 -0
  32. data/lib/ferret/index/document_writer.rb +288 -0
  33. data/lib/ferret/index/field_infos.rb +259 -0
  34. data/lib/ferret/index/fields_io.rb +175 -0
  35. data/lib/ferret/index/index.rb +228 -0
  36. data/lib/ferret/index/index_file_names.rb +33 -0
  37. data/lib/ferret/index/index_reader.rb +462 -0
  38. data/lib/ferret/index/index_writer.rb +488 -0
  39. data/lib/ferret/index/multi_reader.rb +363 -0
  40. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
  41. data/lib/ferret/index/segment_infos.rb +130 -0
  42. data/lib/ferret/index/segment_merge_info.rb +47 -0
  43. data/lib/ferret/index/segment_merge_queue.rb +16 -0
  44. data/lib/ferret/index/segment_merger.rb +337 -0
  45. data/lib/ferret/index/segment_reader.rb +380 -0
  46. data/lib/ferret/index/segment_term_enum.rb +178 -0
  47. data/lib/ferret/index/segment_term_vector.rb +58 -0
  48. data/lib/ferret/index/term.rb +49 -0
  49. data/lib/ferret/index/term_buffer.rb +88 -0
  50. data/lib/ferret/index/term_doc_enum.rb +283 -0
  51. data/lib/ferret/index/term_enum.rb +52 -0
  52. data/lib/ferret/index/term_info.rb +41 -0
  53. data/lib/ferret/index/term_infos_io.rb +312 -0
  54. data/lib/ferret/index/term_vector_offset_info.rb +20 -0
  55. data/lib/ferret/index/term_vectors_io.rb +552 -0
  56. data/lib/ferret/query_parser.rb +274 -0
  57. data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
  58. data/lib/ferret/search.rb +49 -0
  59. data/lib/ferret/search/boolean_clause.rb +100 -0
  60. data/lib/ferret/search/boolean_query.rb +303 -0
  61. data/lib/ferret/search/boolean_scorer.rb +294 -0
  62. data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
  63. data/lib/ferret/search/conjunction_scorer.rb +99 -0
  64. data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
  65. data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
  66. data/lib/ferret/search/explanation.rb +41 -0
  67. data/lib/ferret/search/field_cache.rb +216 -0
  68. data/lib/ferret/search/field_doc.rb +31 -0
  69. data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
  70. data/lib/ferret/search/filter.rb +11 -0
  71. data/lib/ferret/search/filtered_query.rb +130 -0
  72. data/lib/ferret/search/filtered_term_enum.rb +79 -0
  73. data/lib/ferret/search/fuzzy_query.rb +153 -0
  74. data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
  75. data/lib/ferret/search/hit_collector.rb +34 -0
  76. data/lib/ferret/search/hit_queue.rb +11 -0
  77. data/lib/ferret/search/index_searcher.rb +173 -0
  78. data/lib/ferret/search/match_all_docs_query.rb +104 -0
  79. data/lib/ferret/search/multi_phrase_query.rb +204 -0
  80. data/lib/ferret/search/multi_term_query.rb +65 -0
  81. data/lib/ferret/search/non_matching_scorer.rb +22 -0
  82. data/lib/ferret/search/phrase_positions.rb +55 -0
  83. data/lib/ferret/search/phrase_query.rb +217 -0
  84. data/lib/ferret/search/phrase_scorer.rb +153 -0
  85. data/lib/ferret/search/prefix_query.rb +47 -0
  86. data/lib/ferret/search/query.rb +111 -0
  87. data/lib/ferret/search/query_filter.rb +51 -0
  88. data/lib/ferret/search/range_filter.rb +103 -0
  89. data/lib/ferret/search/range_query.rb +139 -0
  90. data/lib/ferret/search/req_excl_scorer.rb +125 -0
  91. data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
  92. data/lib/ferret/search/score_doc.rb +38 -0
  93. data/lib/ferret/search/score_doc_comparator.rb +114 -0
  94. data/lib/ferret/search/scorer.rb +91 -0
  95. data/lib/ferret/search/similarity.rb +278 -0
  96. data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
  97. data/lib/ferret/search/sort.rb +105 -0
  98. data/lib/ferret/search/sort_comparator.rb +60 -0
  99. data/lib/ferret/search/sort_field.rb +87 -0
  100. data/lib/ferret/search/spans.rb +12 -0
  101. data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
  102. data/lib/ferret/search/spans/span_first_query.rb +79 -0
  103. data/lib/ferret/search/spans/span_near_query.rb +108 -0
  104. data/lib/ferret/search/spans/span_not_query.rb +130 -0
  105. data/lib/ferret/search/spans/span_or_query.rb +176 -0
  106. data/lib/ferret/search/spans/span_query.rb +25 -0
  107. data/lib/ferret/search/spans/span_scorer.rb +74 -0
  108. data/lib/ferret/search/spans/span_term_query.rb +105 -0
  109. data/lib/ferret/search/spans/span_weight.rb +84 -0
  110. data/lib/ferret/search/spans/spans_enum.rb +44 -0
  111. data/lib/ferret/search/term_query.rb +128 -0
  112. data/lib/ferret/search/term_scorer.rb +181 -0
  113. data/lib/ferret/search/top_docs.rb +24 -0
  114. data/lib/ferret/search/top_field_docs.rb +17 -0
  115. data/lib/ferret/search/weight.rb +54 -0
  116. data/lib/ferret/search/wildcard_query.rb +26 -0
  117. data/lib/ferret/search/wildcard_term_enum.rb +61 -0
  118. data/lib/ferret/stemmers.rb +1 -0
  119. data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
  120. data/lib/ferret/store.rb +5 -0
  121. data/lib/ferret/store/buffered_index_io.rb +191 -0
  122. data/lib/ferret/store/directory.rb +139 -0
  123. data/lib/ferret/store/fs_store.rb +338 -0
  124. data/lib/ferret/store/index_io.rb +259 -0
  125. data/lib/ferret/store/ram_store.rb +282 -0
  126. data/lib/ferret/utils.rb +7 -0
  127. data/lib/ferret/utils/bit_vector.rb +105 -0
  128. data/lib/ferret/utils/date_tools.rb +138 -0
  129. data/lib/ferret/utils/number_tools.rb +91 -0
  130. data/lib/ferret/utils/parameter.rb +41 -0
  131. data/lib/ferret/utils/priority_queue.rb +120 -0
  132. data/lib/ferret/utils/string_helper.rb +47 -0
  133. data/lib/ferret/utils/weak_key_hash.rb +51 -0
  134. data/rake_utils/code_statistics.rb +106 -0
  135. data/setup.rb +1551 -0
  136. data/test/benchmark/tb_ram_store.rb +76 -0
  137. data/test/benchmark/tb_rw_vint.rb +26 -0
  138. data/test/longrunning/tc_numbertools.rb +60 -0
  139. data/test/longrunning/tm_store.rb +19 -0
  140. data/test/test_all.rb +9 -0
  141. data/test/test_helper.rb +6 -0
  142. data/test/unit/analysis/tc_analyzer.rb +21 -0
  143. data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
  144. data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
  145. data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
  146. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
  147. data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
  148. data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
  149. data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
  150. data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
  151. data/test/unit/analysis/tc_stop_filter.rb +14 -0
  152. data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
  153. data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
  154. data/test/unit/analysis/tc_word_list_loader.rb +32 -0
  155. data/test/unit/document/tc_document.rb +47 -0
  156. data/test/unit/document/tc_field.rb +80 -0
  157. data/test/unit/index/tc_compound_file_io.rb +107 -0
  158. data/test/unit/index/tc_field_infos.rb +119 -0
  159. data/test/unit/index/tc_fields_io.rb +167 -0
  160. data/test/unit/index/tc_index.rb +140 -0
  161. data/test/unit/index/tc_index_reader.rb +622 -0
  162. data/test/unit/index/tc_index_writer.rb +57 -0
  163. data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
  164. data/test/unit/index/tc_segment_infos.rb +74 -0
  165. data/test/unit/index/tc_segment_term_docs.rb +17 -0
  166. data/test/unit/index/tc_segment_term_enum.rb +60 -0
  167. data/test/unit/index/tc_segment_term_vector.rb +71 -0
  168. data/test/unit/index/tc_term.rb +22 -0
  169. data/test/unit/index/tc_term_buffer.rb +57 -0
  170. data/test/unit/index/tc_term_info.rb +19 -0
  171. data/test/unit/index/tc_term_infos_io.rb +192 -0
  172. data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
  173. data/test/unit/index/tc_term_vectors_io.rb +108 -0
  174. data/test/unit/index/th_doc.rb +244 -0
  175. data/test/unit/query_parser/tc_query_parser.rb +84 -0
  176. data/test/unit/search/tc_filter.rb +113 -0
  177. data/test/unit/search/tc_fuzzy_query.rb +136 -0
  178. data/test/unit/search/tc_index_searcher.rb +188 -0
  179. data/test/unit/search/tc_search_and_sort.rb +98 -0
  180. data/test/unit/search/tc_similarity.rb +37 -0
  181. data/test/unit/search/tc_sort.rb +48 -0
  182. data/test/unit/search/tc_sort_field.rb +27 -0
  183. data/test/unit/search/tc_spans.rb +153 -0
  184. data/test/unit/store/tc_fs_store.rb +84 -0
  185. data/test/unit/store/tc_ram_store.rb +35 -0
  186. data/test/unit/store/tm_store.rb +180 -0
  187. data/test/unit/store/tm_store_lock.rb +68 -0
  188. data/test/unit/ts_analysis.rb +16 -0
  189. data/test/unit/ts_document.rb +4 -0
  190. data/test/unit/ts_index.rb +18 -0
  191. data/test/unit/ts_query_parser.rb +3 -0
  192. data/test/unit/ts_search.rb +10 -0
  193. data/test/unit/ts_store.rb +6 -0
  194. data/test/unit/ts_utils.rb +10 -0
  195. data/test/unit/utils/tc_bit_vector.rb +65 -0
  196. data/test/unit/utils/tc_date_tools.rb +50 -0
  197. data/test/unit/utils/tc_number_tools.rb +59 -0
  198. data/test/unit/utils/tc_parameter.rb +40 -0
  199. data/test/unit/utils/tc_priority_queue.rb +62 -0
  200. data/test/unit/utils/tc_string_helper.rb +21 -0
  201. data/test/unit/utils/tc_weak_key_hash.rb +25 -0
  202. metadata +251 -0
@@ -0,0 +1,130 @@
1
+ module Ferret
2
+ module Index
3
+ # Holds the info for one segment.
4
+ #
5
+ # ToDo: Does the dir really need to be stored here?
6
+ class SegmentInfo
7
+ attr_accessor :name, :doc_count, :directory
8
+
9
+ def initialize(name, doc_count, dir)
10
+ @name = name
11
+ @doc_count = doc_count
12
+ @directory = dir
13
+ end
14
+
15
+ def ==(o)
16
+ (o.name == @name and o.doc_count == @doc_count)
17
+ end
18
+ end
19
+
20
# An ordered list of the SegmentInfo objects that make up an index,
# persisted to/from the "segments" file.
class SegmentInfos < Array
  # for compatibility with Java Lucene index files
  FORMAT = -1
  SEGMENT_FILENAME = "segments"
  TEMPORARY_SEGMENT_FILENAME = "segments.new"

  # counts how often the index has been modified
  # by adding or deleting docs
  attr_reader :version
  # used to name new segments
  attr_accessor :counter

  # Reads just the current version number from the segments file in
  # +directory+, avoiding a full read when the file format allows it.
  # Returns 0 when no segments file exists yet.
  def SegmentInfos.read_current_version(directory)
    return 0 if not directory.exists?(SEGMENT_FILENAME)
    input = directory.open_input(SEGMENT_FILENAME)
    # Locals, not ivars: this is a class method and must not leave
    # state behind on the SegmentInfos class object.
    format = 0
    version = 0
    begin
      format = input.read_int()
      if format < 0
        # fixed: interpolate instead of String + Integer concatenation,
        # which raised TypeError instead of the intended error
        raise "Unknown format version: #{format}" if format < FORMAT
        version = input.read_long() # read version
      end
    ensure
      input.close()
    end

    return version if format < 0

    # We cannot be sure about the format of the file.
    # Therefore we have to read the whole file and cannot simply
    # seek to the version entry.
    sis = SegmentInfos.new()
    sis.read(directory)
    return sis.version()
  end

  def initialize()
    super()
    # pseudo-unique starting version derived from the clock (ms)
    @version = Time.now.to_i * 1000
    @counter = 0
  end

  # Returns a copy of this list in which every SegmentInfo element has
  # also been cloned. (The previous implementation called itself
  # recursively and never returned the copy.)
  def clone
    copy = super
    each_index { |i| copy[i] = self[i].clone }
    copy
  end

  # Populate this list from the segments file in +directory+.
  # raises:: IOError
  def read(directory)
    input = directory.open_input(SEGMENT_FILENAME)
    begin
      @format = input.read_int()
      if @format < 0 # file contains explicit format info
        # check that it is a format we can understand
        raise "Unknown format version: #{@format}" if @format < FORMAT
        @version = input.read_long()
        @counter = input.read_int()
      else # file is in old format without explicit format info
        @counter = @format
      end

      seg_count = input.read_int()
      seg_count.times do
        self << SegmentInfo.new(input.read_string(),
                                input.read_int(),
                                directory)
      end

      if @format >= 0
        # in old format the version number may be at the end of the file
        if input.pos() >= input.length()
          @version = 0 # old file format without version number
        else
          @version = input.read_long() # read version
        end
      end
    ensure
      input.close()
    end
  end

  # Write the list to a temporary file and atomically rename it over the
  # real segments file so readers never see a half-written file.
  # raises:: IOError
  def write(directory)
    output = directory.create_output(TEMPORARY_SEGMENT_FILENAME)
    begin
      output.write_int(FORMAT)         # write FORMAT
      output.write_long(@version += 1) # every write changes the index
      output.write_int(@counter)       # write counter
      output.write_int(size())         # write infos
      each() do |si|
        output.write_string(si.name)
        output.write_int(si.doc_count)
      end
    ensure
      output.close()
    end

    # install new segment info
    directory.rename(TEMPORARY_SEGMENT_FILENAME, SEGMENT_FILENAME)
  end

  # Human-readable rendering, e.g. "\nSegmentInfos: <_1:3,_2:5>".
  # An empty list now renders as "<>" instead of the mangled ">" the
  # in-place byte overwrite used to produce.
  def to_s()
    "\nSegmentInfos: <" +
      collect { |si| "#{si.name}:#{si.doc_count}" }.join(",") + ">"
  end
end
129
+ end
130
+ end
@@ -0,0 +1,47 @@
1
+ module Ferret
2
+ module Index
3
# Tracks the merge state for one source segment: the reader, its term
# enumerator, the current term, and a map that renumbers documents
# around deletions.
class SegmentMergeInfo
  attr_reader :term, :term_enum, :reader, :postings, :doc_map, :base

  # base::      document-number offset of this segment in the merged index
  # term_enum:: the segment's term enumerator (positioned before use)
  # reader::    the IndexReader for the segment being merged
  def initialize(base, term_enum, reader)
    @base = base
    @reader = reader
    @term_enum = term_enum
    @term = term_enum.term()
    @postings = @reader.term_positions()

    # Build an array mapping old doc numbers to their position once
    # deleted docs are squeezed out; deleted slots get -1.
    if @reader.has_deletions?()
      total = @reader.max_doc()
      @doc_map = Array.new(total)
      next_slot = 0
      total.times do |doc|
        if @reader.deleted?(doc)
          @doc_map[doc] = -1
        else
          @doc_map[doc] = next_slot
          next_slot += 1
        end
      end
    end
  end

  # Advance to the next term. Returns true and updates #term on success;
  # clears #term and returns false when the enumerator is exhausted.
  def next?
    unless @term_enum.next?
      @term = nil
      return false
    end
    @term = @term_enum.term
    true
  end

  # Release the enumerator and postings; drops the reader reference.
  def close()
    @term_enum.close()
    @postings.close()
    @reader = nil
  end
end
45
+ end
46
+ end
47
+
@@ -0,0 +1,16 @@
1
+ module Ferret::Index
2
# Priority queue of SegmentMergeInfo objects, ordered first by term and,
# for equal terms, by the segment's document base so earlier segments
# are merged first.
class SegmentMergeQueue < Ferret::Utils::PriorityQueue
  # true when +sti_a+ should be popped before +sti_b+
  def less_than(sti_a, sti_b)
    return sti_a.base < sti_b.base if sti_a.term == sti_b.term
    sti_a.term < sti_b.term
  end

  # Close every queued SegmentMergeInfo, then empty the queue.
  def close()
    @heap.each { |sti| sti.close unless sti.nil? }
    clear
  end
end
16
+ end
@@ -0,0 +1,337 @@
1
+ module Ferret::Index
2
+
3
# The SegmentMerger class combines two or more Segments, represented by
# an IndexReader, into a single Segment. After adding the appropriate
# readers with add(), call the merge() method to combine the segments.
#
# If the compound-file flag is set, the merged segment's files can
# afterwards be packed into a compound file via create_compound_file().
class SegmentMerger

  # dir::  The Directory to merge the other segments into
  # name:: The name of the new segment
  # term_index_interval:: spacing of indexed terms in the term dictionary
  def initialize(dir, name,
                 term_index_interval = IndexWriter::DEFAULT_TERM_INDEX_INTERVAL)
    @directory = dir
    @segment = name
    @term_index_interval = term_index_interval
    @readers = []
    @field_infos = nil
    @freq_output = nil
    @prox_output = nil
    @term_infos_writer = nil
    @queue = nil
    @term_info = TermInfo.new()
    # in-memory buffer that accumulates skip data for the current term
    # until write_skip() appends it to the .frq file
    @skip_buffer = Ferret::Store::RAMDirectory::RAMIndexOutput.new(
      Ferret::Store::RAMDirectory::RAMFile.new(""))
  end

  # Add an IndexReader to the collection of readers that are to be merged
  def add(reader)
    @readers << reader
  end

  # i:: The index of the reader to return
  # returns:: The ith reader to be merged
  def segment_reader(i)
    return @readers[i]
  end

  # Merges the readers specified by the add() method into the directory
  # passed to the constructor.
  # returns:: The number of documents that were merged
  # raises:: IOError
  def merge()
    value = merge_fields()
    merge_terms()
    merge_norms()
    merge_vectors() if @field_infos.has_vectors?
    return value
  end

  # close all IndexReaders that have been added.
  # Should not be called before merge().
  # raises:: IOError
  def close_readers()
    @readers.each { |reader| reader.close }
  end

  # Pack the merged segment's files into a single compound file called
  # +file_name+.
  # returns:: the list of file names that were added
  def create_compound_file(file_name)
    cfs_writer = CompoundFileWriter.new(@directory, file_name)

    files = []

    # Basic files
    IndexFileNames::COMPOUND_EXTENSIONS.each do |ext|
      files << @segment + "." + ext
    end

    # Field norm files
    @field_infos.each_with_index do |fi, i|
      files << @segment + ".f#{i}" if fi.indexed?
    end

    # Vector files
    if @field_infos.has_vectors?
      IndexFileNames::VECTOR_EXTENSIONS.each do |ext|
        files << @segment + "." + ext
      end
    end

    # Now merge all added files
    files.each do |file|
      cfs_writer.add_file(file)
    end

    # Perform the merge
    cfs_writer.close

    return files
  end

  # Merge the field names and stored field values of all readers.
  # returns:: The number of (undeleted) documents across all readers
  # raises:: IOError
  def merge_fields()
    @field_infos = FieldInfos.new() # merge field names
    doc_count = 0
    @readers.each do |reader|
      @field_infos.add_fields(reader.get_field_names(IndexReader::FieldOption::TERM_VECTOR_WITH_POSITION_OFFSET), true, true, true, true)
      @field_infos.add_fields(reader.get_field_names(IndexReader::FieldOption::TERM_VECTOR_WITH_POSITION), true, true, true, false)
      @field_infos.add_fields(reader.get_field_names(IndexReader::FieldOption::TERM_VECTOR_WITH_OFFSET), true, true, false, true)
      @field_infos.add_fields(reader.get_field_names(IndexReader::FieldOption::TERM_VECTOR), true, true, false, false)
      @field_infos.add_fields(reader.get_field_names(IndexReader::FieldOption::INDEXED), true, false, false, false)
      @field_infos.add_fields(reader.get_field_names(IndexReader::FieldOption::UNINDEXED), false)
    end
    @field_infos.write_to_dir(@directory, @segment + ".fnm")

    # merge field values
    fields_writer = FieldsWriter.new(@directory, @segment, @field_infos)

    begin
      @readers.each do |reader|
        max_doc = reader.max_doc()
        max_doc.times do |j|
          if not reader.deleted?(j) # skip deleted docs
            fields_writer.add_document(reader.get_document(j))
            doc_count += 1
          end
        end
      end
    ensure
      fields_writer.close()
    end
    return doc_count
  end

  # Merge the TermVectors from each of the segments into the new one.
  # raises:: IOError
  def merge_vectors()
    term_vectors_writer = TermVectorsWriter.new(@directory, @segment, @field_infos)

    begin
      @readers.each do |reader|
        max_doc = reader.max_doc()
        max_doc.times do |doc_num|
          # skip deleted docs
          next if reader.deleted?(doc_num)
          term_vectors_writer.add_all_doc_vectors(reader.get_term_vectors(doc_num))
        end
      end
    ensure
      term_vectors_writer.close()
    end
  end

  # Open the frequency/proximity outputs and the term dictionary writer,
  # then merge every term via merge_term_infos().
  def merge_terms()
    begin
      @freq_output = @directory.create_output(@segment + ".frq")
      @prox_output = @directory.create_output(@segment + ".prx")
      @term_infos_writer =
        TermInfosWriter.new(@directory, @segment, @field_infos,
                            @term_index_interval)
      @skip_interval = @term_infos_writer.skip_interval
      @queue = SegmentMergeQueue.new(@readers.size())

      merge_term_infos()
    ensure
      # close whatever was actually opened; any of these is still nil if
      # an earlier create_output/new raised
      [@freq_output, @prox_output, @term_infos_writer, @queue].each do |obj|
        obj.close() if obj
      end
    end
  end

  # Repeatedly pop the run of queue entries positioned on the same term
  # and merge their postings into the new segment.
  def merge_term_infos()
    base = 0
    @readers.each do |reader|
      term_enum = reader.terms()
      smi = SegmentMergeInfo.new(base, term_enum, reader)
      base += reader.num_docs()
      if smi.next?
        @queue.push(smi) # initialize @queue
      else
        smi.close()
      end
    end

    match = Array.new(@readers.size)

    while @queue.size > 0
      match_size = 0 # pop matching terms
      match[match_size] = @queue.pop
      match_size += 1
      term = match[0].term
      top = @queue.top

      while top and term == top.term
        match[match_size] = @queue.pop
        match_size += 1
        top = @queue.top
      end

      merge_term_info(match, match_size) # add new TermInfo

      while match_size > 0
        match_size -= 1
        smi = match[match_size]
        if smi.next?
          @queue.push(smi) # restore queue
        else
          smi.close() # done with a segment
        end
      end
    end
  end

  # Merge one term found in one or more segments. The array +smis+
  # contains segments that are positioned at the same term; +n+ is the
  # number of cells in the array actually occupied.
  #
  # smis:: array of segments
  # n::    number of cells in the array actually occupied
  def merge_term_info(smis, n)
    freq_pointer = @freq_output.pos
    prox_pointer = @prox_output.pos

    df = append_postings(smis, n) # append posting data

    skip_pointer = write_skip()

    if df > 0
      # add an entry to the dictionary with pointers to prox and freq files
      @term_info.set_values!(df, freq_pointer, prox_pointer, (skip_pointer - freq_pointer))
      @term_infos_writer.add(smis[0].term, @term_info)
    end
  end

  # Process postings from multiple segments all positioned on the
  # same term. Writes out merged entries into the @freq_output and
  # @prox_output streams.
  #
  # smis:: array of segments
  # n::    number of cells in the array actually occupied
  # returns:: number of documents across all segments where this term was found
  def append_postings(smis, n)
    last_doc = 0
    df = 0 # number of docs w/ term
    reset_skip()
    n.times do |i|
      smi = smis[i]
      postings = smi.postings
      base = smi.base
      doc_map = smi.doc_map

      postings.seek(smi.term_enum)
      while postings.next?
        doc = postings.doc()
        doc = doc_map[doc] if doc_map != nil # work around deletions
        doc += base                          # convert to merged space

        if doc < last_doc
          raise "docs out of order current doc = " + doc.to_s +
                " and previous doc = " + last_doc.to_s
        end

        df += 1

        # record a skip point every @skip_interval documents
        if (df % @skip_interval) == 0
          buffer_skip(last_doc)
        end

        doc_code = (doc - last_doc) << 1 # use low bit to flag freq=1
        last_doc = doc

        freq = postings.freq
        if freq == 1
          @freq_output.write_vint(doc_code | 1) # write doc & freq=1
        else
          @freq_output.write_vint(doc_code) # write doc
          @freq_output.write_vint(freq)     # write frequency in doc
        end

        last_position = 0 # write position deltas
        freq.times do |j|
          position = postings.next_position()
          @prox_output.write_vint(position - last_position)
          last_position = position
        end
      end
    end
    return df
  end

  # Clear the skip buffer and remember the current output positions so
  # subsequent skip entries can be written as deltas.
  def reset_skip()
    @skip_buffer.reset()
    @last_skip_doc = 0
    @last_skip_freq_pointer = @freq_output.pos
    @last_skip_prox_pointer = @prox_output.pos
  end

  # Buffer one skip entry (doc and file-pointer deltas) for +doc+.
  def buffer_skip(doc)
    freq_pointer = @freq_output.pos
    prox_pointer = @prox_output.pos

    @skip_buffer.write_vint(doc - @last_skip_doc)
    @skip_buffer.write_vint(freq_pointer - @last_skip_freq_pointer)
    @skip_buffer.write_vint(prox_pointer - @last_skip_prox_pointer)

    @last_skip_doc = doc
    @last_skip_freq_pointer = freq_pointer
    @last_skip_prox_pointer = prox_pointer
  end

  # Flush the buffered skip data to the .frq file.
  # returns:: the file position where the skip data starts
  def write_skip()
    skip_pointer = @freq_output.pos
    @skip_buffer.write_to(@freq_output)
    return skip_pointer
  end

  # Merge the norm files (one byte per doc per indexed field), dropping
  # entries for deleted documents.
  def merge_norms()
    @field_infos.each_with_index do |fi, i|
      if fi.indexed?
        output = @directory.create_output(@segment + ".f" + i.to_s)
        begin
          @readers.each do |reader|
            max_doc = reader.max_doc()
            # reusable byte buffer filled by get_norms_into
            input = "0" * max_doc
            reader.get_norms_into(fi.name, input, 0)
            max_doc.times do |k|
              if not reader.deleted?(k)
                # NOTE(review): input[k] is a Fixnum under Ruby 1.8 but a
                # 1-char String under 1.9+ — confirm write_byte handles both
                output.write_byte(input[k])
              end
            end
          end
        ensure
          output.close()
        end
      end
    end
  end
end
337
+ end