ferret 0.1.3 → 0.1.4

Files changed (57)
  1. data/Rakefile +1 -1
  2. data/TODO +3 -0
  3. data/ext/dummy.exe +0 -0
  4. data/lib/ferret.rb +1 -1
  5. data/lib/ferret/analysis/token.rb +6 -0
  6. data/lib/ferret/analysis/tokenizers.rb +5 -5
  7. data/lib/ferret/document/document.rb +10 -13
  8. data/lib/ferret/index/compound_file_io.rb +12 -9
  9. data/lib/ferret/index/field_infos.rb +0 -6
  10. data/lib/ferret/index/index.rb +220 -102
  11. data/lib/ferret/index/index_reader.rb +22 -2
  12. data/lib/ferret/index/index_writer.rb +55 -14
  13. data/lib/ferret/index/multi_reader.rb +279 -279
  14. data/lib/ferret/index/segment_infos.rb +3 -3
  15. data/lib/ferret/index/segment_merger.rb +7 -6
  16. data/lib/ferret/index/segment_reader.rb +23 -7
  17. data/lib/ferret/index/segment_term_enum.rb +6 -7
  18. data/lib/ferret/index/term_buffer.rb +3 -5
  19. data/lib/ferret/index/term_doc_enum.rb +7 -2
  20. data/lib/ferret/index/term_infos_io.rb +15 -8
  21. data/lib/ferret/query_parser/query_parser.tab.rb +49 -45
  22. data/lib/ferret/search/boolean_query.rb +3 -4
  23. data/lib/ferret/search/boolean_scorer.rb +11 -11
  24. data/lib/ferret/search/caching_wrapper_filter.rb +1 -1
  25. data/lib/ferret/search/disjunction_sum_scorer.rb +9 -7
  26. data/lib/ferret/search/field_cache.rb +1 -2
  27. data/lib/ferret/search/field_sorted_hit_queue.rb +1 -1
  28. data/lib/ferret/search/fuzzy_term_enum.rb +64 -58
  29. data/lib/ferret/search/index_searcher.rb +16 -9
  30. data/lib/ferret/search/prefix_query.rb +7 -0
  31. data/lib/ferret/search/query_filter.rb +1 -1
  32. data/lib/ferret/search/term_scorer.rb +5 -1
  33. data/lib/ferret/search/top_docs.rb +12 -0
  34. data/lib/ferret/store/buffered_index_io.rb +5 -6
  35. data/lib/ferret/store/fs_store.rb +47 -33
  36. data/lib/ferret/store/ram_store.rb +2 -2
  37. data/lib/ferret/utils.rb +1 -0
  38. data/lib/ferret/utils/bit_vector.rb +20 -2
  39. data/lib/ferret/utils/thread_local.rb +28 -0
  40. data/lib/ferret/utils/weak_key_hash.rb +11 -2
  41. data/test/benchmark/tb_rw_vint.rb +1 -1
  42. data/test/functional/thread_safety_index_test.rb +81 -0
  43. data/test/functional/thread_safety_test.rb +137 -0
  44. data/test/test_all.rb +3 -7
  45. data/test/test_helper.rb +2 -1
  46. data/test/unit/index/tc_compound_file_io.rb +2 -2
  47. data/test/unit/index/tc_index.rb +128 -6
  48. data/test/unit/index/tc_index_reader.rb +1 -1
  49. data/test/unit/index/tc_segment_infos.rb +1 -1
  50. data/test/unit/index/th_doc.rb +1 -1
  51. data/test/unit/search/tc_index_searcher.rb +6 -0
  52. data/test/unit/store/tc_fs_store.rb +3 -3
  53. data/test/unit/utils/tc_bit_vector.rb +8 -0
  54. data/test/unit/utils/tc_thread.rb +61 -0
  55. data/test/unit/utils/tc_weak_key_hash.rb +2 -2
  56. data/test/utils/number_to_spoken.rb +132 -0
  57. metadata +7 -2
@@ -62,8 +62,11 @@ module Ferret::Index
       FieldOption.new("TERM_VECTOR_WITH_POSITION_OFFSET")
     end
 
-    # directory:: Directory where IndexReader files reside.
-    # segment_infos:: Used for write-l
+    # To create an IndexReader use the IndexReader.open method. This method
+    # should only be used by subclasses.
+    #
+    # directory:: Directory where IndexReader files reside.
+    # segment_infos:: Used for write-l
     # close_directory:: close the directory when the index reader is closed
     def initialize(directory, segment_infos = nil,
                    close_directory = false, directory_owner = false)
@@ -81,7 +84,24 @@ module Ferret::Index
     end
 
     # Returns an index reader to read the index in the directory
+    #
+    # directory::       This can either be a Directory object or you can pass
+    #                   nil (a RAMDirectory is created) or a path (an
+    #                   FSDirectory is created). If you choose the second or
+    #                   third option, you should leave close_directory as
+    #                   true and infos as nil.
+    # close_directory:: True if you want the IndexReader to close the
+    #                   directory when the IndexReader is closed. You'll want
+    #                   to set this to false if other objects are using the
+    #                   same directory object.
+    # infos::           Expert: This can be used to read a different version
+    #                   of the index but should really be left alone.
     def IndexReader.open(directory, close_directory = true, infos = nil)
+      if directory.nil?
+        directory = Ferret::Store::RAMDirectory.new
+      elsif directory.is_a?(String)
+        directory = Ferret::Store::FSDirectory.new(directory, true)
+      end
       directory.synchronize do # in- & inter-process sync
         commit_lock = directory.make_lock(IndexWriter::COMMIT_LOCK_NAME)
         commit_lock.while_locked() do
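
With this change IndexReader.open accepts a Directory object, a path String, or nil. A minimal usage sketch (assuming an index already exists at the path shown; the path itself is only illustrative):

    require 'ferret'

    # A String argument now creates the FSDirectory for you; because
    # close_directory defaults to true, closing the reader also closes
    # the directory it created.
    reader = Ferret::Index::IndexReader.open("/tmp/my_index")
    puts reader.num_docs()
    reader.close()

    # An existing Directory object still works; pass close_directory =
    # false if other objects share the same directory.
    dir = Ferret::Store::FSDirectory.new("/tmp/my_index", false)
    reader = Ferret::Index::IndexReader.open(dir, false)
    reader.close()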
@@ -54,6 +54,9 @@ module Index
     # NOTE:: all options are passed in a hash.
     #
     # dir::      the index directory
+    #
+    # == Options
+    #
     # analyzer:: the analyzer to use. Defaults to StandardAnalyzer.
     # create::   +true+ to create the index or overwrite the existing one,
     #            +false+ to append to the existing index
@@ -62,17 +65,23 @@ module Index
     # close_dir:: This specifies whether you would like this class to close
     #             the index directory when this class is closed. The
     #             default is false.
-    def initialize(dir, options = {})
+    # use_compound_file:: Use a compound file to store the index. This is
+    #                     slower than using multiple files but it prevents
+    #                     the "too many files open" error. Defaults to true.
+    def initialize(dir = nil, options = {})
       super()
-      create = options[:create]||false
-      create_if_missing = options[:create_if_missing]||false
+      create = options[:create] || false
+      create_if_missing = options[:create_if_missing] || false
 
-      if dir.instance_of?(String)
-        @directory = FSDirectory.get_directory(dir, create||create_if_missing)
+      if dir.nil?
+        @directory = Ferret::Store::RAMDirectory.new
+      elsif dir.is_a?(String)
+        @directory = Ferret::Store::FSDirectory.new(dir, create)
       else
        @directory = dir
      end
      @close_dir = options[:close_dir] || false
+      @use_compound_file = (options[:use_compound_file] != false) # i.e. default true
      @analyzer = options[:analyzer] || Ferret::Analysis::StandardAnalyzer.new
      @merge_factor = DEFAULT_MERGE_FACTOR
      @min_merge_docs = DEFAULT_MIN_MERGE_DOCS
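
A sketch of calling the reworked constructor (paths illustrative). Note the `!= false` comparison: any value other than an explicit false, including omitting the option entirely, leaves compound-file storage enabled:

    require 'ferret'

    # nil => the writer creates its own in-memory RAMDirectory
    ram_writer = Ferret::Index::IndexWriter.new(nil, :create => true)

    # String => an FSDirectory is created at that path; only an explicit
    # :use_compound_file => false disables compound-file storage
    fs_writer = Ferret::Index::IndexWriter.new("/tmp/my_index",
                                               :create => true,
                                               :use_compound_file => false)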
@@ -108,6 +117,8 @@ module Index
           end
         end
       end
+
+      @info_stream = nil
     end
 
     # Flushes all changes to an index and closes all associated files.
@@ -125,11 +136,11 @@ module Index
 
     # Returns the number of documents currently in this index.
     def doc_count()
-      count = 0
       synchronize() do
+        count = 0
         @segment_infos.each { |si| count += si.doc_count() }
+        return count
       end
-      return count
     end
 
     # Adds a document to this index, using the provided analyzer instead of the
@@ -223,7 +234,7 @@ module Index
       merger = SegmentMerger.new(@directory, merged_name, @term_index_interval)
 
       if (@segment_infos.size() == 1) # add existing index, if any
-        s_reader = SegmentReader.new(@segment_infos[0])
+        s_reader = SegmentReader.get(@segment_infos[0])
         merger << s_reader
         segments_to_delete << s_reader
       end
@@ -232,7 +243,7 @@ module Index
         merger << reader
       end
 
-      doc_count = merger.merge!() # merge 'em
+      doc_count = merger.merge() # merge 'em
 
       @segment_infos.clear() # pop old infos & add new
       @segment_infos << SegmentInfo.new(merged_name, doc_count, @directory)
@@ -241,9 +252,22 @@ module Index
        @directory.make_lock(COMMIT_LOCK_NAME).while_locked(COMMIT_LOCK_TIMEOUT) do
          @segment_infos.write(@directory) # commit changes
          delete_segments(segments_to_delete)
-          return nil
        end
      end
+
+      if @use_compound_file
+        files_to_delete = merger.create_compound_file(merged_name + ".tmp")
+        @directory.synchronize() do # in- & inter-process sync
+          @directory.make_lock(COMMIT_LOCK_NAME).while_locked(COMMIT_LOCK_TIMEOUT) do
+            # make compound file visible for SegmentReaders
+            @directory.rename(merged_name + ".tmp", merged_name + ".cfs")
+            # delete now unused files of segment
+            delete_files_and_write_undeletable(files_to_delete)
+          end
+        end
+      end
+
+      optimize()
    end
  end
 
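In the block added above, the compound file is written under a temporary ".tmp" name and only renamed to ".cfs" while the commit lock is held, so a SegmentReader can never observe a half-written compound file. A generic sketch of that write-then-rename idea in plain Ruby (not Ferret's Directory API):

    data = "..." # stand-in for the merged segment's bytes
    File.open("_seg1.tmp", "wb") { |f| f.write(data) }
    # the rename is effectively atomic, so readers see either the old
    # state or the complete new file, never a partial write
    File.rename("_seg1.tmp", "_seg1.cfs")
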
@@ -379,11 +403,10 @@ module Index
      merged_doc_count = merger.merge()
 
      if (@info_stream != nil)
-        @info_stream.print(" into " + merged_name + " (" + merged_doc_count.to_s + " docs)\n")
+        @info_stream.print(" into #{merged_name} (#{merged_doc_count.to_s} docs)\n")
      end
 
      (max_segment-1).downto(min_segment) {|i| @segment_infos.delete_at(i) }
-      #@segment_infos = @segment_infos[0,min_segment] + @segment_infos[max_segment...-1]
 
      @segment_infos << SegmentInfo.new(merged_name, merged_doc_count, @directory)
 
@@ -394,10 +417,21 @@ module Index
        @directory.make_lock(COMMIT_LOCK_NAME).while_locked(COMMIT_LOCK_TIMEOUT) do
          @segment_infos.write(@directory) # commit before deleting
          delete_segments(segments_to_delete) # delete now-unused segments
-          return nil
        end
      end
-      segments_to_delete.size.times {|i| segments_to_delete[i] = nil }
+
+      if @use_compound_file
+        files_to_delete = merger.create_compound_file(merged_name + ".tmp")
+        @directory.synchronize() do # in- & inter-process sync
+          @directory.make_lock(COMMIT_LOCK_NAME).while_locked(COMMIT_LOCK_TIMEOUT) do
+            # make compound file visible for SegmentReaders
+            @directory.rename(merged_name + ".tmp", merged_name + ".cfs")
+            # delete now unused files of segment
+            delete_files_and_write_undeletable(files_to_delete)
+          end
+        end
+      end
+
    end
 
    # Some operating systems (e.g. Windows) don't permit a file to be
@@ -440,6 +474,13 @@ module Index
 
    end
 
+    def delete_files_and_write_undeletable(files)
+      deletable = []
+      try_to_delete_files(read_deleteable_files(), deletable) # try to delete deleteable
+      try_to_delete_files(files, deletable) # try to delete our files
+      write_deleteable_files(deletable) # note files we can't delete
+    end
+
    def delete_files(file_names, dir)
      file_names.each do |file_name|
        dir.delete(file_name)
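
The new delete_files_and_write_undeletable helper exists because some operating systems (notably Windows, per the comment above this hunk) refuse to delete files that are still open; failed deletions are recorded and retried at a later commit. A minimal sketch of that retry idea (helper and file names illustrative, not Ferret's actual API):

    def try_to_delete(paths, still_undeletable)
      paths.each do |path|
        begin
          File.delete(path)
        rescue SystemCallError
          still_undeletable << path # held open; retry at the next commit
        end
      end
    end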
@@ -1,133 +1,133 @@
-module Ferret
-  module Index
-    # An IndexReader which reads multiple indexes, appending their content.
-    class MultiReader < IndexReader
-      attr_reader :max_doc
-
-      # Construct a MultiReader aggregating the named set of (sub)readers.
-      # Directory locking for delete, undeleteAll, and set_norm operations is
-      # left to the subreaders.
-      #
-      # Note that all subreaders are closed if this MultiReader is closed.
-      # sub_readers:: set of (sub)readers
-      # raises:: IOException
-      def initialize(sub_readers, directory = nil, sis = nil, close_dir = false)
-        if (directory)
-          super(directory, sis, close_dir)
-        else
-          super(sub_readers.length == 0 ? nil : sub_readers[0].directory())
+module Ferret::Index
+  # An IndexReader which reads multiple indexes, appending their content.
+  class MultiReader < IndexReader
+    attr_reader :max_doc
+
+    # Construct a MultiReader aggregating the named set of (sub)readers.
+    # Directory locking for delete, undeleteAll, and set_norm operations is
+    # left to the subreaders.
+    #
+    # Note that all subreaders are closed if this MultiReader is closed.
+    # sub_readers:: set of (sub)readers
+    # raises:: IOException
+    def initialize(sub_readers, directory = nil, sis = nil, close_dir = false)
+      if (directory)
+        super(directory, sis, close_dir)
+      else
+        super(sub_readers.length == 0 ? nil : sub_readers[0].directory())
+      end
+
+      @max_doc = 0
+      @num_docs = -1
+      @has_deletions = false
+
+      @sub_readers = sub_readers
+      @starts = Array.new(@sub_readers.length + 1) # build starts array
+      @sub_readers.each_with_index do |sub_reader, i|
+        @starts[i] = @max_doc
+        @max_doc += sub_reader.max_doc # compute max_docs
+
+        if @sub_readers[i].has_deletions?
+          @has_deletions = true
        end
-
-        @max_doc = 0
-        @num_docs = -1
-        @has_deletions = false
-
-        @sub_readers = sub_readers
-        @starts = Array.new(@sub_readers.length + 1) # build starts array
-        @sub_readers.each_with_index do |sub_reader, i|
-          @starts[i] = @max_doc
-          @max_doc += sub_reader.max_doc # compute maxDocs
-
-          if @sub_readers[i].has_deletions?
-            @has_deletions = true
-          end
-        end
-        @starts[@sub_readers.length] = @max_doc
-        @norms_cache = {}
      end
+      @starts[@sub_readers.length] = @max_doc
+      @norms_cache = {}
+    end
 
 
-      # Return an array of term frequency vectors for the specified document. The
-      # array contains a vector for each vectorized field in the document. Each
-      # vector contains term numbers and frequencies for all terms in a
-      # given vectorized field. If no such fields existed, the method returns
-      # nil.
-      def get_term_vectors(n)
-        i = reader_index(n) # find segment num
-        return @sub_readers[i].get_term_vectors(n - @starts[i]); # dispatch to segment
-      end
+    # Return an array of term frequency vectors for the specified document. The
+    # array contains a vector for each vectorized field in the document. Each
+    # vector contains term numbers and frequencies for all terms in a
+    # given vectorized field. If no such fields existed, the method returns
+    # nil.
+    def get_term_vectors(n)
+      i = reader_index(n) # find segment num
+      return @sub_readers[i].get_term_vectors(n - @starts[i]); # dispatch to segment
+    end
 
-      def get_term_vector(n, field)
-        i = reader_index(n) # find segment num
-        return @sub_readers[i].get_term_vector(n - @starts[i], field)
-      end
+    def get_term_vector(n, field)
+      i = reader_index(n) # find segment num
+      return @sub_readers[i].get_term_vector(n - @starts[i], field)
+    end
 
-      def num_docs()
-        synchronize do
-          if (@num_docs == -1) # check cache
-            n = 0 # cache miss - recompute
-            @sub_readers.each {|reader| n += reader.num_docs()}
-            @num_docs = n
-          end
-          return @num_docs
+    def num_docs()
+      synchronize do
+        if (@num_docs == -1) # check cache
+          n = 0 # cache miss - recompute
+          @sub_readers.each {|reader| n += reader.num_docs()}
+          @num_docs = n
        end
+        return @num_docs
      end
+    end
 
-      def get_document(n)
-        i = reader_index(n) # find segment num
-        return @sub_readers[i].get_document(n - @starts[i]) # dispatch to segment reader
-      end
+    def get_document(n)
+      i = reader_index(n) # find segment num
+      return @sub_readers[i].get_document(n - @starts[i]) # dispatch to segment reader
+    end
 
-      def deleted?(n)
-        i = reader_index(n) # find segment num
-        return @sub_readers[i].deleted?(n - @starts[i]) # dispatch to segment reader
-      end
+    def deleted?(n)
+      i = reader_index(n) # find segment num
+      return @sub_readers[i].deleted?(n - @starts[i]) # dispatch to segment reader
+    end
 
-      def has_deletions?()
-        return @has_deletions
-      end
+    def has_deletions?()
+      return @has_deletions
+    end
 
-      def do_delete(n)
-        @num_docs = -1 # invalidate cache
-        i = reader_index(n) # find segment num
-        @sub_readers[i].delete(n - @starts[i]) # dispatch to segment reader
-        @has_deletions = true
-      end
+    def do_delete(n)
+      @num_docs = -1 # invalidate cache
+      i = reader_index(n) # find segment num
+      @sub_readers[i].delete(n - @starts[i]) # dispatch to segment reader
+      @has_deletions = true
+    end
 
-      def do_undelete_all()
-        @num_docs = -1 # invalidate cache
-        @sub_readers.each {|reader| reader.undelete_all() }
-        @has_deletions = false
-      end
+    def do_undelete_all()
+      @num_docs = -1 # invalidate cache
+      @sub_readers.each {|reader| reader.undelete_all() }
+      @has_deletions = false
+    end
 
-      def reader_index(n) # find reader for doc n:
-        lo = 0 # search @starts array
-        hi = @sub_readers.length - 1 # for first element less
-
-        while (hi >= lo)
-          mid = (lo + hi) >> 1
-          mid_value = @starts[mid]
-          if (n < mid_value)
-            hi = mid - 1
-          elsif (n > mid_value)
-            lo = mid + 1
-          else # found a match
-            while (mid+1 < @sub_readers.length and @starts[mid+1] == mid_value)
-              mid += 1 # scan to last match
-            end
-            return mid
+    def reader_index(n) # find reader for doc n:
+      lo = 0 # search @starts array
+      hi = @sub_readers.length - 1 # for first element less
+
+      while (hi >= lo)
+        mid = (lo + hi) >> 1
+        mid_value = @starts[mid]
+        if (n < mid_value)
+          hi = mid - 1
+        elsif (n > mid_value)
+          lo = mid + 1
+        else # found a match
+          while (mid+1 < @sub_readers.length and @starts[mid+1] == mid_value)
+            mid += 1 # scan to last match
          end
+          return mid
        end
-        return hi
      end
+      return hi
+    end
 
-      def get_norms(field)
-        synchronize do
-          bytes = @norms_cache[field]
-          if (bytes != nil)
-            return bytes # cache hit
-          end
+    def get_norms(field)
+      synchronize do
+        bytes = @norms_cache[field]
+        if (bytes != nil)
+          return bytes # cache hit
+        end
 
-          bytes = " " * @max_doc
-          @sub_readers.length.times do |i|
-            @sub_readers[i].get_norms_into(field, bytes, @starts[i])
-          end
-          @norms_cache[field] = bytes # update cache
-          return bytes
+        bytes = " " * @max_doc
+        @sub_readers.length.times do |i|
+          @sub_readers[i].get_norms_into(field, bytes, @starts[i])
        end
+        @norms_cache[field] = bytes # update cache
+        return bytes
      end
+    end
 
-      def get_norms_into(field, buf, offset)
+    def get_norms_into(field, buf, offset)
+      synchronize do
        bytes = @norms_cache[field]
        if (bytes != nil) # cache hit
          buf[offset ,@max_doc] = bytes[0, @max_doc]
@@ -138,226 +138,226 @@ module Ferret
          @sub_readers[i].get_norms_into(field, buf, offset + @starts[i])
        end
      end
+    end
 
-      def do_set_norm(n, field, value)
-        @norms_cache.delete(field) # clear cache
-        i = reader_index(n) # find segment num
-        @sub_readers[i].set_norm(n-@starts[i], field, value); # dispatch
-      end
+    def do_set_norm(n, field, value)
+      @norms_cache.delete(field) # clear cache
+      i = reader_index(n) # find segment num
+      @sub_readers[i].set_norm(n-@starts[i], field, value); # dispatch
+    end
 
-      def terms()
-        return MultiTermEnum.new(@sub_readers, @starts, nil)
-      end
+    def terms()
+      return MultiTermEnum.new(@sub_readers, @starts, nil)
+    end
 
-      def terms_from(term)
-        return MultiTermEnum.new(@sub_readers, @starts, term)
-      end
+    def terms_from(term)
+      return MultiTermEnum.new(@sub_readers, @starts, term)
+    end
 
-      def doc_freq(t)
-        total = 0 # sum freqs in segments
-        @sub_readers.each {|reader| total += reader.doc_freq(t)}
-        return total
-      end
+    def doc_freq(t)
+      total = 0 # sum freqs in segments
+      @sub_readers.each {|reader| total += reader.doc_freq(t)}
+      return total
+    end
 
-      def term_docs()
-        return MultiTermDocEnum.new(@sub_readers, @starts)
-      end
+    def term_docs()
+      return MultiTermDocEnum.new(@sub_readers, @starts)
+    end
 
-      def term_positions()
-        return MultiTermDocPosEnum.new(@sub_readers, @starts)
-      end
+    def term_positions()
+      return MultiTermDocPosEnum.new(@sub_readers, @starts)
+    end
 
-      def do_commit()
-        @sub_readers.each {|reader| reader.commit() }
-      end
+    def do_commit()
+      @sub_readers.each {|reader| reader.commit() }
+    end
 
-      def do_close()
-        synchronize do
-          @sub_readers.each {|reader| reader.close() }
-        end
-      end
+    def do_close()
+      synchronize do
+        @sub_readers.each {|reader| reader.close() }
+      end
+    end
 
-      # See IndexReader#get_field_names
-      def get_field_names(field_option = IndexReader::FieldOption::ALL)
-        # maintain a unique set of field names
-        field_set = Set.new
-        @sub_readers.each do |reader|
-          field_set |= reader.get_field_names(field_option)
-        end
-        return field_set
-      end
-    end
+    # See IndexReader#get_field_names
+    def get_field_names(field_option = IndexReader::FieldOption::ALL)
+      # maintain a unique set of field names
+      field_set = Set.new
+      @sub_readers.each do |reader|
+        field_set |= reader.get_field_names(field_option)
+      end
+      return field_set
+    end
+  end
 
-    class MultiTermEnum < TermEnum
+  class MultiTermEnum < TermEnum
 
-      attr_reader :doc_freq, :term
+    attr_reader :doc_freq, :term
 
-      def initialize(readers, starts, t)
-        @queue = SegmentMergeQueue.new(readers.length)
-        readers.each_index do |i|
-          reader = readers[i]
-          term_enum = nil
-          if (t != nil)
-            term_enum = reader.terms_from(t)
-          else
-            term_enum = reader.terms()
-          end
-          smi = SegmentMergeInfo.new(starts[i], term_enum, reader)
-
-          if (t == nil and smi.next?) or term_enum.term
-            @queue.push(smi); # initialize queue
-          else
-            smi.close()
-          end
-        end
-
-        if (t != nil and @queue.size() > 0)
-          next?()
-        end
-      end
+    def initialize(readers, starts, t)
+      @queue = SegmentMergeQueue.new(readers.length)
+      readers.each_index do |i|
+        reader = readers[i]
+        term_enum = nil
+        if (t != nil)
+          term_enum = reader.terms_from(t)
+        else
+          term_enum = reader.terms()
+        end
+        smi = SegmentMergeInfo.new(starts[i], term_enum, reader)
+
+        if (t == nil and smi.next?) or term_enum.term
+          @queue.push(smi); # initialize queue
+        else
+          smi.close()
+        end
+      end
+
+      if (t != nil and @queue.size() > 0)
+        next?()
+      end
+    end
 
-      def next?()
-        top = @queue.top()
-        if (top == nil)
-          @term = nil
-          return false
-        end
-
-        @term = top.term
-        @doc_freq = 0
-
-        while top and @term == top.term
-          @queue.pop()
-          @doc_freq += top.term_enum.doc_freq() # increment freq
-          if (top.next?)
-            @queue.push(top) # restore queue
-          else
-            top.close() # done with a segment
-          end
-          top = @queue.top()
-        end
-        return true
-      end
+    def next?()
+      top = @queue.top()
+      if (top == nil)
+        @term = nil
+        return false
+      end
+
+      @term = top.term
+      @doc_freq = 0
+
+      while top and @term == top.term
+        @queue.pop()
+        @doc_freq += top.term_enum.doc_freq() # increment freq
+        if (top.next?)
+          @queue.push(top) # restore queue
+        else
+          top.close() # done with a segment
+        end
+        top = @queue.top()
+      end
+      return true
+    end
 
-      def close()
-        @queue.close()
-      end
-    end
+    def close()
+      @queue.close()
+    end
+  end
 
-    class MultiTermDocEnum < TermDocEnum
-      attr_accessor :readers, :starts, :term, :base, :pointer, :current
+  class MultiTermDocEnum < TermDocEnum
+    attr_accessor :readers, :starts, :term, :base, :pointer, :current
 
-      def initialize(readers, starts)
-        @readers = readers
-        @starts = starts
-        @base = 0
-        @pointer = 0
-
-        @reader_term_docs = Array.new(readers.length)
-      end
+    def initialize(readers, starts)
+      @readers = readers
+      @starts = starts
+      @base = 0
+      @pointer = 0
+
+      @reader_term_docs = Array.new(readers.length)
+    end
 
-      def doc
-        return @base + @current.doc()
-      end
+    def doc
+      return @base + @current.doc()
+    end
 
-      def freq
-        return @current.freq()
-      end
+    def freq
+      return @current.freq()
+    end
 
-      def seek(term)
-        @term = term
-        @base = 0
-        @pointer = 0
-        @current = nil
-      end
+    def seek(term)
+      @term = term
+      @base = 0
+      @pointer = 0
+      @current = nil
+    end
 
-      def next?
-        if @current and @current.next?
-          return true
-        elsif @pointer < @readers.length
-          @base = @starts[@pointer]
-          @current = term_docs(@pointer)
-          @pointer += 1
-          return next?()
-        else
-          return false
-        end
-      end
+    def next?
+      if @current and @current.next?
+        return true
+      elsif @pointer < @readers.length
+        @base = @starts[@pointer]
+        @current = term_docs(@pointer)
+        @pointer += 1
+        return next?()
+      else
+        return false
+      end
+    end
 
-      # Optimized implementation. Unlike the Java version, this method
-      # always returns as many results as it can read.
-      def read(docs, freqs)
-        got = 0
-        last_got = 0
-        needed = docs.length
-
-        while (true)
-          while @current.nil?
-            if @pointer < @readers.length # begin next segment
-              @base = @starts[@pointer]
-              @current = term_docs(@pointer)
-              @pointer += 1
-            else
-              return got
-            end
-          end
-          got = @current.read(docs, freqs, got)
-          if (got == last_got) # none left in segment
-            @current = nil
-          else # got some
-            b = @base # adjust doc numbers
-            (last_got...got).each {|i| docs[i] += b}
-            if got == needed
-              return got
-            else
-              last_got = got
-            end
-          end
-        end
-      end
+    # Optimized implementation. Unlike the Java version, this method
+    # always returns as many results as it can read.
+    def read(docs, freqs)
+      got = 0
+      last_got = 0
+      needed = docs.length
+
+      while (true)
+        while @current.nil?
+          if @pointer < @readers.length # try next segment
+            @base = @starts[@pointer]
+            @current = term_docs(@pointer)
+            @pointer += 1
+          else
+            return got
+          end
+        end
+        got = @current.read(docs, freqs, got)
+        if (got == last_got) # none left in segment
+          @current = nil
+        else # got some
+          b = @base # adjust doc numbers
+          (last_got...got).each {|i| docs[i] += b}
+          if got == needed
+            return got
+          else
+            last_got = got
+          end
+        end
+      end
+    end
 
-      # As yet unoptimized implementation.
-      def skip_to(target)
-        begin
-          return false if not next?
-        end while target > doc()
-        return true
-      end
+    # As yet unoptimized implementation.
+    def skip_to(target)
+      begin
+        return false if not next?
+      end while target > doc()
+      return true
+    end
 
-      def term_docs(i)
-        return nil if (@term == nil)
-        result = @reader_term_docs[i]
-        if (result == nil)
-          result = @reader_term_docs[i] = term_docs_from_reader(@readers[i])
-        end
-        result.seek(@term)
-        return result
-      end
+    def term_docs(i)
+      return nil if (@term == nil)
+      result = @reader_term_docs[i]
+      if (result == nil)
+        result = @reader_term_docs[i] = term_docs_from_reader(@readers[i])
+      end
+      result.seek(@term)
+      return result
+    end
 
-      def term_docs_from_reader(reader)
-        return reader.term_docs()
-      end
+    def term_docs_from_reader(reader)
+      return reader.term_docs()
+    end
 
-      def close()
-        @reader_term_docs.compact.each do |rtd|
-          rtd.close()
-        end
-      end
-    end
+    def close()
+      @reader_term_docs.compact.each do |rtd|
+        rtd.close()
+      end
+    end
+  end
 
-    class MultiTermDocPosEnum < MultiTermDocEnum
-      def initialize(r, s)
-        super(r,s)
-      end
+  class MultiTermDocPosEnum < MultiTermDocEnum
+    def initialize(r, s)
+      super(r,s)
+    end
 
-      def term_docs_from_reader(reader)
-        return reader.term_positions()
-      end
+    def term_docs_from_reader(reader)
+      return reader.term_positions()
+    end
 
-      def next_position()
-        return @current.next_position()
-      end
-    end
-  end
-end
+    def next_position()
+      return @current.next_position()
+    end
+
+  end
+end
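
Throughout MultiReader, starts[i] is the first composite document number served by sub-reader i, with a final extra entry holding the total doc count; reader_index maps a composite doc number back to its sub-reader by binary search over that array. A standalone sketch of the same mapping, with illustrative values:

    # three sub-readers holding 10, 15 and 15 docs respectively
    starts = [0, 10, 25, 40]

    def reader_index(n, starts, num_readers)
      lo, hi = 0, num_readers - 1
      while hi >= lo
        mid = (lo + hi) >> 1
        if n < starts[mid]
          hi = mid - 1
        elsif n > starts[mid]
          lo = mid + 1
        else
          # skip over empty sub-readers that share the same start
          mid += 1 while mid + 1 < num_readers && starts[mid + 1] == starts[mid]
          return mid
        end
      end
      hi
    end

    i = reader_index(12, starts, 3) # => 1 (second sub-reader)
    local_doc = 12 - starts[i]      # => 2 (its third document)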