ferret 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202)
  1. data/MIT-LICENSE +20 -0
  2. data/README +109 -0
  3. data/Rakefile +275 -0
  4. data/TODO +9 -0
  5. data/TUTORIAL +197 -0
  6. data/ext/extconf.rb +3 -0
  7. data/ext/ferret.c +23 -0
  8. data/ext/ferret.h +85 -0
  9. data/ext/index_io.c +543 -0
  10. data/ext/priority_queue.c +227 -0
  11. data/ext/ram_directory.c +316 -0
  12. data/ext/segment_merge_queue.c +41 -0
  13. data/ext/string_helper.c +42 -0
  14. data/ext/tags +240 -0
  15. data/ext/term.c +261 -0
  16. data/ext/term_buffer.c +299 -0
  17. data/ext/util.c +12 -0
  18. data/lib/ferret.rb +41 -0
  19. data/lib/ferret/analysis.rb +11 -0
  20. data/lib/ferret/analysis/analyzers.rb +93 -0
  21. data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
  22. data/lib/ferret/analysis/token.rb +79 -0
  23. data/lib/ferret/analysis/token_filters.rb +86 -0
  24. data/lib/ferret/analysis/token_stream.rb +26 -0
  25. data/lib/ferret/analysis/tokenizers.rb +107 -0
  26. data/lib/ferret/analysis/word_list_loader.rb +27 -0
  27. data/lib/ferret/document.rb +2 -0
  28. data/lib/ferret/document/document.rb +152 -0
  29. data/lib/ferret/document/field.rb +304 -0
  30. data/lib/ferret/index.rb +26 -0
  31. data/lib/ferret/index/compound_file_io.rb +343 -0
  32. data/lib/ferret/index/document_writer.rb +288 -0
  33. data/lib/ferret/index/field_infos.rb +259 -0
  34. data/lib/ferret/index/fields_io.rb +175 -0
  35. data/lib/ferret/index/index.rb +228 -0
  36. data/lib/ferret/index/index_file_names.rb +33 -0
  37. data/lib/ferret/index/index_reader.rb +462 -0
  38. data/lib/ferret/index/index_writer.rb +488 -0
  39. data/lib/ferret/index/multi_reader.rb +363 -0
  40. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
  41. data/lib/ferret/index/segment_infos.rb +130 -0
  42. data/lib/ferret/index/segment_merge_info.rb +47 -0
  43. data/lib/ferret/index/segment_merge_queue.rb +16 -0
  44. data/lib/ferret/index/segment_merger.rb +337 -0
  45. data/lib/ferret/index/segment_reader.rb +380 -0
  46. data/lib/ferret/index/segment_term_enum.rb +178 -0
  47. data/lib/ferret/index/segment_term_vector.rb +58 -0
  48. data/lib/ferret/index/term.rb +49 -0
  49. data/lib/ferret/index/term_buffer.rb +88 -0
  50. data/lib/ferret/index/term_doc_enum.rb +283 -0
  51. data/lib/ferret/index/term_enum.rb +52 -0
  52. data/lib/ferret/index/term_info.rb +41 -0
  53. data/lib/ferret/index/term_infos_io.rb +312 -0
  54. data/lib/ferret/index/term_vector_offset_info.rb +20 -0
  55. data/lib/ferret/index/term_vectors_io.rb +552 -0
  56. data/lib/ferret/query_parser.rb +274 -0
  57. data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
  58. data/lib/ferret/search.rb +49 -0
  59. data/lib/ferret/search/boolean_clause.rb +100 -0
  60. data/lib/ferret/search/boolean_query.rb +303 -0
  61. data/lib/ferret/search/boolean_scorer.rb +294 -0
  62. data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
  63. data/lib/ferret/search/conjunction_scorer.rb +99 -0
  64. data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
  65. data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
  66. data/lib/ferret/search/explanation.rb +41 -0
  67. data/lib/ferret/search/field_cache.rb +216 -0
  68. data/lib/ferret/search/field_doc.rb +31 -0
  69. data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
  70. data/lib/ferret/search/filter.rb +11 -0
  71. data/lib/ferret/search/filtered_query.rb +130 -0
  72. data/lib/ferret/search/filtered_term_enum.rb +79 -0
  73. data/lib/ferret/search/fuzzy_query.rb +153 -0
  74. data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
  75. data/lib/ferret/search/hit_collector.rb +34 -0
  76. data/lib/ferret/search/hit_queue.rb +11 -0
  77. data/lib/ferret/search/index_searcher.rb +173 -0
  78. data/lib/ferret/search/match_all_docs_query.rb +104 -0
  79. data/lib/ferret/search/multi_phrase_query.rb +204 -0
  80. data/lib/ferret/search/multi_term_query.rb +65 -0
  81. data/lib/ferret/search/non_matching_scorer.rb +22 -0
  82. data/lib/ferret/search/phrase_positions.rb +55 -0
  83. data/lib/ferret/search/phrase_query.rb +217 -0
  84. data/lib/ferret/search/phrase_scorer.rb +153 -0
  85. data/lib/ferret/search/prefix_query.rb +47 -0
  86. data/lib/ferret/search/query.rb +111 -0
  87. data/lib/ferret/search/query_filter.rb +51 -0
  88. data/lib/ferret/search/range_filter.rb +103 -0
  89. data/lib/ferret/search/range_query.rb +139 -0
  90. data/lib/ferret/search/req_excl_scorer.rb +125 -0
  91. data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
  92. data/lib/ferret/search/score_doc.rb +38 -0
  93. data/lib/ferret/search/score_doc_comparator.rb +114 -0
  94. data/lib/ferret/search/scorer.rb +91 -0
  95. data/lib/ferret/search/similarity.rb +278 -0
  96. data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
  97. data/lib/ferret/search/sort.rb +105 -0
  98. data/lib/ferret/search/sort_comparator.rb +60 -0
  99. data/lib/ferret/search/sort_field.rb +87 -0
  100. data/lib/ferret/search/spans.rb +12 -0
  101. data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
  102. data/lib/ferret/search/spans/span_first_query.rb +79 -0
  103. data/lib/ferret/search/spans/span_near_query.rb +108 -0
  104. data/lib/ferret/search/spans/span_not_query.rb +130 -0
  105. data/lib/ferret/search/spans/span_or_query.rb +176 -0
  106. data/lib/ferret/search/spans/span_query.rb +25 -0
  107. data/lib/ferret/search/spans/span_scorer.rb +74 -0
  108. data/lib/ferret/search/spans/span_term_query.rb +105 -0
  109. data/lib/ferret/search/spans/span_weight.rb +84 -0
  110. data/lib/ferret/search/spans/spans_enum.rb +44 -0
  111. data/lib/ferret/search/term_query.rb +128 -0
  112. data/lib/ferret/search/term_scorer.rb +181 -0
  113. data/lib/ferret/search/top_docs.rb +24 -0
  114. data/lib/ferret/search/top_field_docs.rb +17 -0
  115. data/lib/ferret/search/weight.rb +54 -0
  116. data/lib/ferret/search/wildcard_query.rb +26 -0
  117. data/lib/ferret/search/wildcard_term_enum.rb +61 -0
  118. data/lib/ferret/stemmers.rb +1 -0
  119. data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
  120. data/lib/ferret/store.rb +5 -0
  121. data/lib/ferret/store/buffered_index_io.rb +191 -0
  122. data/lib/ferret/store/directory.rb +139 -0
  123. data/lib/ferret/store/fs_store.rb +338 -0
  124. data/lib/ferret/store/index_io.rb +259 -0
  125. data/lib/ferret/store/ram_store.rb +282 -0
  126. data/lib/ferret/utils.rb +7 -0
  127. data/lib/ferret/utils/bit_vector.rb +105 -0
  128. data/lib/ferret/utils/date_tools.rb +138 -0
  129. data/lib/ferret/utils/number_tools.rb +91 -0
  130. data/lib/ferret/utils/parameter.rb +41 -0
  131. data/lib/ferret/utils/priority_queue.rb +120 -0
  132. data/lib/ferret/utils/string_helper.rb +47 -0
  133. data/lib/ferret/utils/weak_key_hash.rb +51 -0
  134. data/rake_utils/code_statistics.rb +106 -0
  135. data/setup.rb +1551 -0
  136. data/test/benchmark/tb_ram_store.rb +76 -0
  137. data/test/benchmark/tb_rw_vint.rb +26 -0
  138. data/test/longrunning/tc_numbertools.rb +60 -0
  139. data/test/longrunning/tm_store.rb +19 -0
  140. data/test/test_all.rb +9 -0
  141. data/test/test_helper.rb +6 -0
  142. data/test/unit/analysis/tc_analyzer.rb +21 -0
  143. data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
  144. data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
  145. data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
  146. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
  147. data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
  148. data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
  149. data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
  150. data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
  151. data/test/unit/analysis/tc_stop_filter.rb +14 -0
  152. data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
  153. data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
  154. data/test/unit/analysis/tc_word_list_loader.rb +32 -0
  155. data/test/unit/document/tc_document.rb +47 -0
  156. data/test/unit/document/tc_field.rb +80 -0
  157. data/test/unit/index/tc_compound_file_io.rb +107 -0
  158. data/test/unit/index/tc_field_infos.rb +119 -0
  159. data/test/unit/index/tc_fields_io.rb +167 -0
  160. data/test/unit/index/tc_index.rb +140 -0
  161. data/test/unit/index/tc_index_reader.rb +622 -0
  162. data/test/unit/index/tc_index_writer.rb +57 -0
  163. data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
  164. data/test/unit/index/tc_segment_infos.rb +74 -0
  165. data/test/unit/index/tc_segment_term_docs.rb +17 -0
  166. data/test/unit/index/tc_segment_term_enum.rb +60 -0
  167. data/test/unit/index/tc_segment_term_vector.rb +71 -0
  168. data/test/unit/index/tc_term.rb +22 -0
  169. data/test/unit/index/tc_term_buffer.rb +57 -0
  170. data/test/unit/index/tc_term_info.rb +19 -0
  171. data/test/unit/index/tc_term_infos_io.rb +192 -0
  172. data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
  173. data/test/unit/index/tc_term_vectors_io.rb +108 -0
  174. data/test/unit/index/th_doc.rb +244 -0
  175. data/test/unit/query_parser/tc_query_parser.rb +84 -0
  176. data/test/unit/search/tc_filter.rb +113 -0
  177. data/test/unit/search/tc_fuzzy_query.rb +136 -0
  178. data/test/unit/search/tc_index_searcher.rb +188 -0
  179. data/test/unit/search/tc_search_and_sort.rb +98 -0
  180. data/test/unit/search/tc_similarity.rb +37 -0
  181. data/test/unit/search/tc_sort.rb +48 -0
  182. data/test/unit/search/tc_sort_field.rb +27 -0
  183. data/test/unit/search/tc_spans.rb +153 -0
  184. data/test/unit/store/tc_fs_store.rb +84 -0
  185. data/test/unit/store/tc_ram_store.rb +35 -0
  186. data/test/unit/store/tm_store.rb +180 -0
  187. data/test/unit/store/tm_store_lock.rb +68 -0
  188. data/test/unit/ts_analysis.rb +16 -0
  189. data/test/unit/ts_document.rb +4 -0
  190. data/test/unit/ts_index.rb +18 -0
  191. data/test/unit/ts_query_parser.rb +3 -0
  192. data/test/unit/ts_search.rb +10 -0
  193. data/test/unit/ts_store.rb +6 -0
  194. data/test/unit/ts_utils.rb +10 -0
  195. data/test/unit/utils/tc_bit_vector.rb +65 -0
  196. data/test/unit/utils/tc_date_tools.rb +50 -0
  197. data/test/unit/utils/tc_number_tools.rb +59 -0
  198. data/test/unit/utils/tc_parameter.rb +40 -0
  199. data/test/unit/utils/tc_priority_queue.rb +62 -0
  200. data/test/unit/utils/tc_string_helper.rb +21 -0
  201. data/test/unit/utils/tc_weak_key_hash.rb +25 -0
  202. metadata +251 -0
@@ -0,0 +1,130 @@
1
+ module Ferret
2
+ module Index
3
# Describes a single segment of the index: its name, the number of
# documents it holds, and the directory its files live in.
#
# ToDo: Does the dir really need to be stored here?
class SegmentInfo
  attr_accessor :name, :doc_count, :directory

  # name::      the segment's file-name prefix
  # doc_count:: number of documents stored in this segment
  # dir::       the Directory object the segment's files reside in
  def initialize(name, doc_count, dir)
    @name, @doc_count, @directory = name, doc_count, dir
  end

  # Two SegmentInfos are equal when they share the same name and
  # document count; the directory is deliberately not compared.
  def ==(o)
    o.name == @name && o.doc_count == @doc_count
  end
end
19
+
20
# An ordered collection of SegmentInfo records mirroring the on-disk
# "segments" file. Knows how to read and write that file in both the
# legacy layout (leading int is the counter) and the newer layout
# (leading negative int is a format marker).
class SegmentInfos < Array
  # for compatibility with Java Lucene segment files: a negative
  # leading int marks the newer file layout
  FORMAT = -1
  SEGMENT_FILENAME = "segments"
  TEMPORARY_SEGMENT_FILENAME = "segments.new"

  # counts how often the index has been modified
  # by adding or deleting docs
  attr_reader :version
  # used to name new segments
  attr_accessor :counter

  # Current version number from the segments file in +directory+.
  # Returns 0 when no segments file exists. For the legacy layout the
  # whole file has to be parsed, since the version (if present at all)
  # sits at the end at no fixed offset.
  def self.read_current_version(directory)
    return 0 unless directory.exists?(SEGMENT_FILENAME)
    input = directory.open_input(SEGMENT_FILENAME)
    format = 0
    version = 0
    begin
      format = input.read_int()
      if format < 0
        # newer layout: format marker followed by the version long
        # (interpolate: String + Integer would raise TypeError)
        raise "Unknown format version: #{format}" if format < FORMAT
        version = input.read_long() # read version
      end
    ensure
      input.close()
    end

    return version if format < 0

    # We cannot be sure about the format of the file.
    # Therefore we have to read the whole file and cannot simply
    # seek to the version entry.
    sis = SegmentInfos.new()
    sis.read(directory)
    return sis.version()
  end

  def initialize()
    # seed the version with the current time in milliseconds so a
    # fresh index starts with a plausible, increasing value
    @version = Time.now.to_i * 1000
    @counter = 0
  end

  # Deep copy: duplicates the container and every SegmentInfo in it.
  # (The previous implementation called itself recursively and never
  # returned the copy.)
  def clone
    copy = super
    each_index { |i| copy[i] = self[i].clone }
    copy
  end

  # Populates self from the segments file in +directory+, handling
  # both the explicit-format layout and the legacy layout.
  def read(directory)
    input = directory.open_input(SEGMENT_FILENAME)
    begin
      @format = input.read_int()
      if @format < 0 # file contains explicit format info
        # check that it is a format we can understand
        raise "Unknown format version: #{@format}" if @format < FORMAT
        @version = input.read_long()
        @counter = input.read_int()
      else # file is in old format without explicit format info
        @counter = @format
      end

      seg_count = input.read_int()
      seg_count.times do
        self << SegmentInfo.new(input.read_string(),
                                input.read_int(),
                                directory)
      end

      if @format >= 0
        # in the old format the version number may be at the end of the file
        if input.pos() >= input.length()
          @version = 0 # old file format without version number
        else
          @version = input.read_long() # read version
        end
      end
    ensure
      input.close()
    end
  end

  # Writes the collection to the segments file in +directory+. The data
  # is written to a temporary file first and then renamed into place so
  # readers never see a half-written segments file.
  def write(directory)
    output = directory.create_output(TEMPORARY_SEGMENT_FILENAME)
    begin
      output.write_int(FORMAT)          # write FORMAT marker
      output.write_long(@version += 1)  # every write changes the index
      output.write_int(@counter)        # write counter
      output.write_int(size())          # write number of infos
      each() do |si|
        output.write_string(si.name)
        output.write_int(si.doc_count)
      end
    ensure
      output.close()
    end

    # install new segment info
    directory.rename(TEMPORARY_SEGMENT_FILENAME, SEGMENT_FILENAME)
  end

  # Human-readable summary, e.g. "\nSegmentInfos: <a:3,b:5>".
  def to_s()
    str = "\nSegmentInfos: <"
    each() { |si| str << "#{si.name}:#{si.doc_count}," }
    if empty?
      str << ">"    # nothing to trim when there are no segments
    else
      str[-1] = ">" # replace the trailing comma
    end
    str
  end
end
129
+ end
130
+ end
@@ -0,0 +1,47 @@
1
+ module Ferret
2
+ module Index
3
# Per-segment bookkeeping used while merging: tracks the segment's term
# enumerator, its postings, its base document offset in the merged
# index, and (when the segment has deletions) a document renumbering map.
class SegmentMergeInfo
  attr_reader :term, :term_enum, :reader, :postings, :doc_map, :base

  # base::      document-number offset of this segment in the merged index
  # term_enum:: enumerator positioned over the segment's terms
  # reader::    the IndexReader for the segment being merged
  def initialize(base, term_enum, reader)
    @base = base
    @reader = reader
    @term_enum = term_enum
    @term = term_enum.term()
    @postings = @reader.term_positions()

    # When the reader has deletions, precompute a table that renumbers
    # the surviving documents contiguously; deleted slots map to -1.
    return unless @reader.has_deletions?()
    total = @reader.max_doc()
    @doc_map = Array.new(total)
    next_doc = 0
    total.times do |doc|
      if @reader.deleted?(doc)
        @doc_map[doc] = -1
      else
        @doc_map[doc] = next_doc
        next_doc += 1
      end
    end
  end

  # Advances the enumerator. Caches the new term and returns true on
  # success; clears the cached term and returns false at the end.
  def next?
    unless @term_enum.next?
      @term = nil
      return false
    end
    @term = @term_enum.term
    true
  end

  # Releases the enumerator and postings and drops the reader reference.
  def close()
    @term_enum.close()
    @postings.close()
    @reader = nil
  end
end
45
+ end
46
+ end
47
+
@@ -0,0 +1,16 @@
1
module Ferret::Index
  # Priority queue over SegmentMergeInfo entries: ordered first by term
  # and, for identical terms, by segment base so earlier segments come
  # out first.
  class SegmentMergeQueue < Ferret::Utils::PriorityQueue
    def less_than(sti_a, sti_b)
      return sti_a.base < sti_b.base if sti_a.term == sti_b.term
      sti_a.term < sti_b.term
    end

    # Closes every queued entry and empties the queue.
    def close()
      @heap.each { |sti| sti.close unless sti.nil? }
      clear
    end
  end
end
@@ -0,0 +1,337 @@
1
+ module Ferret::Index
2
+
3
# The SegmentMerger class combines two or more Segments, represented by
# an IndexReader (see #add), into a single Segment. After adding the
# appropriate readers, call the merge method to combine the segments.
#
# If the compound-file flag is set, then the segments will be merged
# into a compound file.
class SegmentMerger

  # dir::  The Directory to merge the other segments into
  # name:: The name of the new segment
  # term_index_interval:: spacing of entries in the term index
  def initialize(dir, name,
                 term_index_interval = IndexWriter::DEFAULT_TERM_INDEX_INTERVAL)
    @directory = dir
    @segment = name
    @term_index_interval = term_index_interval
    @readers = []
    @field_infos = nil
    @freq_output = nil
    @prox_output = nil
    @term_infos_writer = nil
    @queue = nil
    @term_info = TermInfo.new()
    # in-memory buffer that accumulates skip data between flushes
    @skip_buffer = Ferret::Store::RAMDirectory::RAMIndexOutput.new(
      Ferret::Store::RAMDirectory::RAMFile.new(""))
  end

  # Add an IndexReader to the collection of readers that are to be merged
  # reader::
  def add(reader)
    @readers << reader
  end

  # i:: The index of the reader to return
  # returns:: The ith reader to be merged
  def segment_reader(i)
    return @readers[i]
  end

  # Merges the readers added via #add into the directory passed to the
  # constructor.
  # returns:: The number of documents that were merged
  # raises:: IOError
  def merge()
    value = merge_fields()
    merge_terms()
    merge_norms()
    merge_vectors() if @field_infos.has_vectors?
    return value
  end

  # close all IndexReaders that have been added.
  # Should not be called before merge().
  # raises:: IOError
  def close_readers()
    @readers.each { |reader| reader.close }
  end

  # Packs the merged segment's files into a single compound file named
  # +file_name+ and returns the list of file names that were added.
  #
  # NOTE: the original pre-sized +files+ with Array.new (leaving nil
  # entries that were then passed to add_file) and called #times on an
  # Array (a NoMethodError); both are fixed below.
  def create_compound_file(file_name)
    cfs_writer = CompoundFileWriter.new(@directory, file_name)

    files = []

    # Basic files
    IndexFileNames::COMPOUND_EXTENSIONS.each do |ext|
      files << @segment + "." + ext
    end

    # Field norm files
    @field_infos.each_with_index do |fi, i|
      files << @segment + ".f#{i}" if fi.indexed?
    end

    # Vector files
    if @field_infos.has_vectors?
      IndexFileNames::VECTOR_EXTENSIONS.each do |ext|
        files << @segment + "." + ext
      end
    end

    # Now merge all added files
    files.each do |file|
      cfs_writer.add_file(file)
    end

    # Perform the merge
    cfs_writer.close

    return files
  end

  # Merges field definitions and stored field values from all readers.
  # returns:: The number of documents in all of the readers
  # raises:: IOError
  def merge_fields()
    @field_infos = FieldInfos.new() # merge field names
    doc_count = 0
    # register each field with the strongest (vector/position/offset)
    # options first so weaker registrations do not downgrade them
    @readers.each do |reader|
      @field_infos.add_fields(reader.get_field_names(IndexReader::FieldOption::TERM_VECTOR_WITH_POSITION_OFFSET), true, true, true, true)
      @field_infos.add_fields(reader.get_field_names(IndexReader::FieldOption::TERM_VECTOR_WITH_POSITION), true, true, true, false)
      @field_infos.add_fields(reader.get_field_names(IndexReader::FieldOption::TERM_VECTOR_WITH_OFFSET), true, true, false, true)
      @field_infos.add_fields(reader.get_field_names(IndexReader::FieldOption::TERM_VECTOR), true, true, false, false)
      @field_infos.add_fields(reader.get_field_names(IndexReader::FieldOption::INDEXED), true, false, false, false)
      @field_infos.add_fields(reader.get_field_names(IndexReader::FieldOption::UNINDEXED), false)
    end
    @field_infos.write_to_dir(@directory, @segment + ".fnm")

    # merge field values
    fields_writer = FieldsWriter.new(@directory, @segment, @field_infos)

    begin
      @readers.each do |reader|
        max_doc = reader.max_doc()
        max_doc.times do |j|
          if not reader.deleted?(j) # skip deleted docs
            fields_writer.add_document(reader.get_document(j))
            doc_count += 1
          end
        end
      end
    ensure
      fields_writer.close()
    end
    return doc_count
  end

  # Merge the TermVectors from each of the segments into the new one.
  # raises:: IOError
  def merge_vectors()
    term_vectors_writer = TermVectorsWriter.new(@directory, @segment, @field_infos)

    begin
      @readers.each do |reader|
        max_doc = reader.max_doc()
        max_doc.times do |doc_num|
          # skip deleted docs
          next if (reader.deleted?(doc_num))
          term_vectors_writer.add_all_doc_vectors(reader.get_term_vectors(doc_num))
        end
      end
    ensure
      term_vectors_writer.close()
    end
  end

  # Creates the .frq, .prx and term-dictionary files for the merged
  # segment. All outputs are closed even when merging fails part-way.
  def merge_terms()
    begin
      @freq_output = @directory.create_output(@segment + ".frq")
      @prox_output = @directory.create_output(@segment + ".prx")
      @term_infos_writer =
        TermInfosWriter.new(@directory, @segment, @field_infos,
                            @term_index_interval)
      @skip_interval = @term_infos_writer.skip_interval
      @queue = SegmentMergeQueue.new(@readers.size())

      merge_term_infos()
    ensure
      # guard against nils: if an early create_output raised, the later
      # members are still unset and a bare close() would raise
      # NoMethodError here, masking the original exception
      [@freq_output, @prox_output, @term_infos_writer, @queue].each do |obj|
        obj.close() if obj
      end
    end
  end

  # Walks all segments' term enumerators in parallel via the priority
  # queue, merging the postings of every distinct term.
  def merge_term_infos()
    base = 0
    @readers.each do |reader|
      term_enum = reader.terms()
      smi = SegmentMergeInfo.new(base, term_enum, reader)
      base += reader.num_docs()
      if (smi.next?)
        @queue.push(smi) # initialize @queue
      else
        smi.close()
      end
    end

    match = Array.new(@readers.size)

    while (@queue.size > 0)
      match_size = 0 # pop matching terms
      match[match_size] = @queue.pop
      match_size += 1
      term = match[0].term
      top = @queue.top

      # gather every segment currently positioned on the same term
      while top and term == top.term
        match[match_size] = @queue.pop
        match_size += 1
        top = @queue.top
      end

      merge_term_info(match, match_size) # add new TermInfo

      while (match_size > 0)
        match_size -= 1
        smi = match[match_size]
        if (smi.next?)
          @queue.push(smi) # restore queue
        else
          smi.close() # done with a segment
        end
      end
    end
  end

  # Merge one term found in one or more segments. The array +smis+
  # contains segments that are positioned at the same term; +n+ is the
  # number of cells in the array actually occupied.
  #
  # smis:: array of segments
  # n:: number of cells in the array actually occupied
  def merge_term_info(smis, n)
    freq_pointer = @freq_output.pos
    prox_pointer = @prox_output.pos

    df = append_postings(smis, n) # append posting data

    skip_pointer = write_skip()

    if (df > 0)
      # add an entry to the dictionary with pointers to prox and freq files
      @term_info.set_values!(df, freq_pointer, prox_pointer, (skip_pointer - freq_pointer))
      @term_infos_writer.add(smis[0].term, @term_info)
    end
  end

  # Process postings from multiple segments all positioned on the
  # same term. Writes out merged entries into the @freq_output and
  # the @prox_output streams.
  #
  # smis:: array of segments
  # n:: number of cells in the array actually occupied
  # returns:: number of documents across all segments where this term was found
  def append_postings(smis, n)
    last_doc = 0
    df = 0 # number of docs w/ term
    reset_skip()
    n.times do |i|
      smi = smis[i]
      postings = smi.postings
      base = smi.base
      doc_map = smi.doc_map

      postings.seek(smi.term_enum)
      while (postings.next?)
        doc = postings.doc()
        doc = doc_map[doc] if (doc_map != nil) # work around deletions
        doc += base # convert to merged space

        if (doc < last_doc)
          # interpolate instead of String + Integer (TypeError)
          raise "docs out of order current doc = #{doc} " +
                "and previous doc = #{last_doc}"
        end

        df += 1

        # emit a skip-list entry every @skip_interval documents
        if ((df % @skip_interval) == 0)
          buffer_skip(last_doc)
        end

        doc_code = (doc - last_doc) << 1 # use low bit to flag freq=1
        last_doc = doc

        freq = postings.freq
        if (freq == 1)
          @freq_output.write_vint(doc_code | 1) # write doc & freq=1
        else
          @freq_output.write_vint(doc_code) # write doc
          @freq_output.write_vint(freq)     # write frequency in doc
        end

        last_position = 0 # write position deltas
        freq.times do |j|
          position = postings.next_position()
          @prox_output.write_vint(position - last_position)
          last_position = position
        end
      end
    end
    return df
  end

  # Clears the skip buffer and records the current output positions as
  # the baseline for the next run of skip deltas.
  def reset_skip()
    @skip_buffer.reset()
    @last_skip_doc = 0
    @last_skip_freq_pointer = @freq_output.pos
    @last_skip_prox_pointer = @prox_output.pos
  end

  # Appends one delta-encoded skip entry (doc, freq pos, prox pos) to
  # the in-memory skip buffer.
  def buffer_skip(doc)
    freq_pointer = @freq_output.pos
    prox_pointer = @prox_output.pos

    @skip_buffer.write_vint(doc - @last_skip_doc)
    @skip_buffer.write_vint(freq_pointer - @last_skip_freq_pointer)
    @skip_buffer.write_vint(prox_pointer - @last_skip_prox_pointer)

    @last_skip_doc = doc
    @last_skip_freq_pointer = freq_pointer
    @last_skip_prox_pointer = prox_pointer
  end

  # Flushes the buffered skip data into the freq stream and returns the
  # position at which it was written.
  def write_skip()
    skip_pointer = @freq_output.pos
    @skip_buffer.write_to(@freq_output)
    return skip_pointer
  end

  # Concatenates the norms of every indexed field across all readers,
  # skipping deleted documents, into one ".fN" file per field.
  def merge_norms()
    @field_infos.each_with_index do |fi, i|
      if (fi.indexed?)
        output = @directory.create_output(@segment + ".f" + i.to_s)
        begin
          @readers.each do |reader|
            max_doc = reader.max_doc()
            input = "0" * max_doc
            reader.get_norms_into(fi.name, input, 0)
            max_doc.times do |k|
              if not reader.deleted?(k)
                # NOTE(review): input[k] is an Integer on Ruby 1.8 but a
                # one-char String on >= 1.9 — confirm what the store
                # layer's write_byte expects
                output.write_byte(input[k])
              end
            end
          end
        ensure
          output.close()
        end
      end
    end
  end
end
337
+ end