ferret 0.1.3 → 0.1.4
- data/Rakefile +1 -1
- data/TODO +3 -0
- data/ext/dummy.exe +0 -0
- data/lib/ferret.rb +1 -1
- data/lib/ferret/analysis/token.rb +6 -0
- data/lib/ferret/analysis/tokenizers.rb +5 -5
- data/lib/ferret/document/document.rb +10 -13
- data/lib/ferret/index/compound_file_io.rb +12 -9
- data/lib/ferret/index/field_infos.rb +0 -6
- data/lib/ferret/index/index.rb +220 -102
- data/lib/ferret/index/index_reader.rb +22 -2
- data/lib/ferret/index/index_writer.rb +55 -14
- data/lib/ferret/index/multi_reader.rb +279 -279
- data/lib/ferret/index/segment_infos.rb +3 -3
- data/lib/ferret/index/segment_merger.rb +7 -6
- data/lib/ferret/index/segment_reader.rb +23 -7
- data/lib/ferret/index/segment_term_enum.rb +6 -7
- data/lib/ferret/index/term_buffer.rb +3 -5
- data/lib/ferret/index/term_doc_enum.rb +7 -2
- data/lib/ferret/index/term_infos_io.rb +15 -8
- data/lib/ferret/query_parser/query_parser.tab.rb +49 -45
- data/lib/ferret/search/boolean_query.rb +3 -4
- data/lib/ferret/search/boolean_scorer.rb +11 -11
- data/lib/ferret/search/caching_wrapper_filter.rb +1 -1
- data/lib/ferret/search/disjunction_sum_scorer.rb +9 -7
- data/lib/ferret/search/field_cache.rb +1 -2
- data/lib/ferret/search/field_sorted_hit_queue.rb +1 -1
- data/lib/ferret/search/fuzzy_term_enum.rb +64 -58
- data/lib/ferret/search/index_searcher.rb +16 -9
- data/lib/ferret/search/prefix_query.rb +7 -0
- data/lib/ferret/search/query_filter.rb +1 -1
- data/lib/ferret/search/term_scorer.rb +5 -1
- data/lib/ferret/search/top_docs.rb +12 -0
- data/lib/ferret/store/buffered_index_io.rb +5 -6
- data/lib/ferret/store/fs_store.rb +47 -33
- data/lib/ferret/store/ram_store.rb +2 -2
- data/lib/ferret/utils.rb +1 -0
- data/lib/ferret/utils/bit_vector.rb +20 -2
- data/lib/ferret/utils/thread_local.rb +28 -0
- data/lib/ferret/utils/weak_key_hash.rb +11 -2
- data/test/benchmark/tb_rw_vint.rb +1 -1
- data/test/functional/thread_safety_index_test.rb +81 -0
- data/test/functional/thread_safety_test.rb +137 -0
- data/test/test_all.rb +3 -7
- data/test/test_helper.rb +2 -1
- data/test/unit/index/tc_compound_file_io.rb +2 -2
- data/test/unit/index/tc_index.rb +128 -6
- data/test/unit/index/tc_index_reader.rb +1 -1
- data/test/unit/index/tc_segment_infos.rb +1 -1
- data/test/unit/index/th_doc.rb +1 -1
- data/test/unit/search/tc_index_searcher.rb +6 -0
- data/test/unit/store/tc_fs_store.rb +3 -3
- data/test/unit/utils/tc_bit_vector.rb +8 -0
- data/test/unit/utils/tc_thread.rb +61 -0
- data/test/unit/utils/tc_weak_key_hash.rb +2 -2
- data/test/utils/number_to_spoken.rb +132 -0
- metadata +7 -2
data/lib/ferret/index/index_reader.rb

@@ -62,8 +62,11 @@ module Ferret::Index
       FieldOption.new("TERM_VECTOR_WITH_POSITION_OFFSET")
     end

-    #
-    #
+    # To create an IndexReader use the IndexReader.open method. This method
+    # should only be used by subclasses.
+    #
+    # directory::       Directory where IndexReader files reside.
+    # segment_infos::   Used for write-l
     # close_directory:: close the directory when the index reader is closed
     def initialize(directory, segment_infos = nil,
                    close_directory = false, directory_owner = false)
@@ -81,7 +84,24 @@ module Ferret::Index
     end

     # Returns an index reader to read the index in the directory
+    #
+    # directory::       This can either be a Directory object or you can pass
+    #                   nil (RamDirectory is created) or a path (FSDirectory
+    #                   is created). If you choose the second or third option,
+    #                   you should leave close_directory as true and infos as
+    #                   nil.
+    # close_directory:: True if you want the IndexReader to close the
+    #                   directory when the IndexReader is closed. You'll want
+    #                   to set this to false if other objects are using the
+    #                   same directory object.
+    # infos::           Expert: This can be used to read a different version
+    #                   of the index but should really be left alone.
     def IndexReader.open(directory, close_directory = true, infos = nil)
+      if directory.nil?
+        directory = Ferret::Store::RAMDirectory.new
+      elsif directory.is_a?(String)
+        directory = Ferret::Store::FSDirectory.new(directory, true)
+      end
       directory.synchronize do # in- & inter-process sync
         commit_lock = directory.make_lock(IndexWriter::COMMIT_LOCK_NAME)
         commit_lock.while_locked() do
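For reference, the reworked IndexReader.open above now accepts three kinds of directory argument. A minimal usage sketch (the on-disk path is hypothetical):

    require 'ferret'
    include Ferret::Index

    # nil => a Ferret::Store::RAMDirectory is created internally
    reader = IndexReader.open(nil)
    reader.close()

    # a String => a Ferret::Store::FSDirectory is created at that path
    reader = IndexReader.open("/tmp/my_index")
    reader.close()

    # an existing Directory object; pass close_directory = false so the
    # reader does not close a directory other objects are still using
    dir = Ferret::Store::FSDirectory.new("/tmp/my_index", false)
    reader = IndexReader.open(dir, false)
    reader.close()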
data/lib/ferret/index/index_writer.rb

@@ -54,6 +54,9 @@ module Index
     # NOTE:: all options are passed in a hash.
     #
     # dir::      the index directory
+    #
+    # == Options
+    #
     # analyzer:: the analyzer to use. Defaults to StandardAnalyzer.
     # create::   +true+ to create the index or overwrite the existing
     #            one, +false+ to append to the existing index
@@ -62,17 +65,23 @@ module Index
     # close_dir:: This specifies whether you would like this class to close
     #             the index directory when this class is closed. The
     #             default is false.
-
+    # use_compound_file:: Use a compound file to store the index. This is
+    #             slower than using multiple files but it prevents the
+    #             too many files open error. This defaults to true.
+    def initialize(dir = nil, options = {})
       super()
-      create = options[:create]||false
-      create_if_missing = options[:create_if_missing]||false
+      create = options[:create] || false
+      create_if_missing = options[:create_if_missing] || false

-      if dir.
-      @directory =
+      if dir.nil?
+        @directory = Ferret::Store::RAMDirectory.new
+      elsif dir.is_a?(String)
+        @directory = Ferret::Store::FSDirectory.new(dir, create)
       else
         @directory = dir
       end
       @close_dir = options[:close_dir] || false
+      @use_compound_file = (options[:use_compound_file] != false) # ie default true
       @analyzer = options[:analyzer] || Ferret::Analysis::StandardAnalyzer.new
       @merge_factor = DEFAULT_MERGE_FACTOR
       @min_merge_docs = DEFAULT_MIN_MERGE_DOCS
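The new IndexWriter signature takes the same kind of directory argument plus the options hash documented above. A minimal sketch (the path is hypothetical):

    require 'ferret'
    include Ferret::Index

    # nil => in-memory RAMDirectory; a String => FSDirectory at that path
    writer = IndexWriter.new("/tmp/my_index",
                             :create            => true,
                             :close_dir         => true,
                             :use_compound_file => false) # defaults to true
    writer.close()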
@@ -108,6 +117,8 @@ module Index
           end
         end
       end
+
+      @info_stream = nil
     end

     # Flushes all changes to an index and closes all associated files.
@@ -125,11 +136,11 @@ module Index

     # Returns the number of documents currently in this index.
     def doc_count()
-      count = 0
       synchronize() do
+        count = 0
         @segment_infos.each { |si| count += si.doc_count() }
+        return count
       end
-      return count
     end

     # Adds a document to this index, using the provided analyzer instead of the
@@ -223,7 +234,7 @@ module Index
       merger = SegmentMerger.new(@directory, merged_name, @term_index_interval)

       if (@segment_infos.size() == 1) # add existing index, if any
-        s_reader = SegmentReader.
+        s_reader = SegmentReader.get(@segment_infos[0])
         merger << s_reader
         segments_to_delete << s_reader
       end
@@ -232,7 +243,7 @@ module Index
         merger << reader
       end

-      doc_count = merger.merge
+      doc_count = merger.merge() # merge 'em

       @segment_infos.clear() # pop old infos & add new
       @segment_infos << SegmentInfo.new(merged_name, doc_count, @directory)
@@ -241,9 +252,22 @@ module Index
        @directory.make_lock(COMMIT_LOCK_NAME).while_locked(COMMIT_LOCK_TIMEOUT) do
          @segment_infos.write(@directory) # commit changes
          delete_segments(segments_to_delete)
-         return nil
        end
      end
+
+     if @use_compound_file
+       files_to_delete = merger.create_compound_file(merged_name + ".tmp")
+       @directory.synchronize() do # in- & inter-process sync
+         @directory.make_lock(COMMIT_LOCK_NAME).while_locked(COMMIT_LOCK_TIMEOUT) do
+           # make compound file visible for SegmentReaders
+           @directory.rename(merged_name + ".tmp", merged_name + ".cfs")
+           # delete now unused files of segment
+           delete_files_and_write_undeletable(files_to_delete)
+         end
+       end
+     end
+
+     optimize()
    end
  end

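Both merge paths now commit the compound file the same way: build it under a temporary name, then make it visible with a single rename taken under the commit lock, so no reader ever sees a half-written .cfs. A standalone sketch of the pattern, with a plain Mutex and File calls standing in for Ferret's Directory and Lock APIs:

    require 'tmpdir'

    COMMIT_LOCK = Mutex.new

    def commit_compound_file(dir, segment)
      tmp = File.join(dir, segment + ".tmp")
      cfs = File.join(dir, segment + ".cfs")
      File.write(tmp, "compound segment data") # build while still invisible
      COMMIT_LOCK.synchronize do
        File.rename(tmp, cfs) # one atomic step makes the segment visible
        # ...now the per-segment files it replaces can be deleted...
      end
    end

    Dir.mktmpdir {|d| commit_compound_file(d, "_1") }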
@@ -379,11 +403,10 @@ module Index
       merged_doc_count = merger.merge()

       if (@info_stream != nil)
-        @info_stream.print(" into
+        @info_stream.print(" into #{merged_name} (#{merged_doc_count.to_s} docs)\n")
       end

       (max_segment-1).downto(min_segment) {|i| @segment_infos.delete_at(i) }
-      #@segment_infos = @segment_infos[0,min_segment] + @segment_infos[max_segment...-1]

       @segment_infos << SegmentInfo.new(merged_name, merged_doc_count, @directory)

@@ -394,10 +417,21 @@ module Index
       @directory.make_lock(COMMIT_LOCK_NAME).while_locked(COMMIT_LOCK_TIMEOUT) do
         @segment_infos.write(@directory) # commit before deleting
         delete_segments(segments_to_delete) # delete now-unused segments
-        return nil
       end
     end
-
+
+    if @use_compound_file
+      files_to_delete = merger.create_compound_file(merged_name + ".tmp")
+      @directory.synchronize() do # in- & inter-process sync
+        @directory.make_lock(COMMIT_LOCK_NAME).while_locked(COMMIT_LOCK_TIMEOUT) do
+          # make compound file visible for SegmentReaders
+          @directory.rename(merged_name + ".tmp", merged_name + ".cfs")
+          # delete now unused files of segment
+          delete_files_and_write_undeletable(files_to_delete)
+        end
+      end
+    end
+
   end

   # Some operating systems (e.g. Windows) don't permit a file to be
@@ -440,6 +474,13 @@ module Index

     end

+    def delete_files_and_write_undeletable(files)
+      deletable = []
+      try_to_delete_files(read_deleteable_files(), deletable) # try to delete deleteable
+      try_to_delete_files(files, deletable) # try to delete our files
+      write_deleteable_files(deletable) # note files we can't delete
+    end
+
     def delete_files(file_names, dir)
       file_names.each do |file_name|
         dir.delete(file_name)
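The new delete_files_and_write_undeletable helper above implements a deferred delete: retry the files that could not be removed last time, try the new ones, and record whatever still fails for the next commit (needed on systems that, as the surrounding file's comment about Windows notes, won't delete open files). A standalone sketch of the idea; the names here are illustrative, not Ferret's API:

    def try_to_delete(paths, still_pending)
      paths.each do |path|
        begin
          File.delete(path) if File.exist?(path)
        rescue SystemCallError # e.g. the file is still open on Windows
          still_pending << path # remember it and retry next commit
        end
      end
    end

    pending = [] # would be loaded from the index's "deletable" file
    failed = []
    try_to_delete(pending + ["_1.frq", "_1.prx"], failed)
    # 'failed' would then be written back for the next attempt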
data/lib/ferret/index/multi_reader.rb

@@ -1,133 +1,133 @@
-module Ferret
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+module Ferret::Index
+  # An IndexReader which reads multiple indexes, appending their content.
+  class MultiReader < IndexReader
+    attr_reader :max_doc
+
+    # Construct a MultiReader aggregating the named set of (sub)readers.
+    # Directory locking for delete, undeleteAll, and set_norm operations is
+    # left to the subreaders.
+    #
+    # Note that all subreaders are closed if this Multireader is closed.
+    # sub_readers:: set of (sub)readers
+    # raises:: IOException
+    def initialize(sub_readers, directory = nil, sis = nil, close_dir = false)
+      if (directory)
+        super(directory, sis, close_dir)
+      else
+        super(sub_readers.length == 0 ? nil : sub_readers[0].directory())
+      end
+
+      @max_doc = 0
+      @num_docs = -1
+      @has_deletions = false
+
+      @sub_readers = sub_readers
+      @starts = Array.new(@sub_readers.length + 1) # build starts array
+      @sub_readers.each_with_index do |sub_reader, i|
+        @starts[i] = @max_doc
+        @max_doc += sub_reader.max_doc # compute max_docs
+
+        if @sub_readers[i].has_deletions?
+          @has_deletions = true
         end
-
-      @max_doc = 0
-      @num_docs = -1
-      @has_deletions = false
-
-      @sub_readers = sub_readers
-      @starts = Array.new(@sub_readers.length + 1) # build starts array
-      @sub_readers.each_with_index do |sub_reader, i|
-        @starts[i] = @max_doc
-        @max_doc += sub_reader.max_doc # compute maxDocs
-
-        if @sub_readers[i].has_deletions?
-          @has_deletions = true
-        end
-      end
-      @starts[@sub_readers.length] = @max_doc
-      @norms_cache = {}
      end
+      @starts[@sub_readers.length] = @max_doc
+      @norms_cache = {}
+    end


-
-
-
-
-
-
-
-
-
+    # Return an array of term frequency vectors for the specified document. The
+    # array contains a vector for each vectorized field in the document. Each
+    # vector contains term numbers and frequencies for all terms in a
+    # given vectorized field. If no such fields existed, the method returns
+    # nil.
+    def get_term_vectors(n)
+      i = reader_index(n) # find segment num
+      return @sub_readers[i].get_term_vectors(n - @starts[i]); # dispatch to segment
+    end

-
-
-
-
+    def get_term_vector(n, field)
+      i = reader_index(n) # find segment num
+      return @sub_readers[i].get_term_vector(n - @starts[i], field)
+    end

-
-
-
-
-
-
-      end
-      return @num_docs
+    def num_docs()
+      synchronize do
+        if (@num_docs == -1) # check cache
+          n = 0 # cache miss - recompute
+          @sub_readers.each {|reader| n += reader.num_docs()}
+          @num_docs = n
        end
+        return @num_docs
      end
+    end

-
-
-
-
+    def get_document(n)
+      i = reader_index(n) # find segment num
+      return @sub_readers[i].get_document(n - @starts[i]) # dispatch to segment reader
+    end

-
-
-
-
+    def deleted?(n)
+      i = reader_index(n) # find segment num
+      return @sub_readers[i].deleted?(n - @starts[i]) # dispatch to segment reader
+    end

-
-
-
+    def has_deletions?()
+      return @has_deletions
+    end

-
-
-
-
-
-
+    def do_delete(n)
+      @num_docs = -1 # invalidate cache
+      i = reader_index(n) # find segment num
+      @sub_readers[i].delete(n - @starts[i]) # dispatch to segment reader
+      @has_deletions = true
+    end

-
-
-
-
-
+    def do_undelete_all()
+      @num_docs = -1 # invalidate cache
+      @sub_readers.each {|reader| reader.undelete_all() }
+      @has_deletions = false
+    end

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-          end
-          return mid
+    def reader_index(n) # find reader for doc n:
+      lo = 0 # search @starts array
+      hi = @sub_readers.length - 1 # for first element less
+
+      while (hi >= lo)
+        mid = (lo + hi) >> 1
+        mid_value = @starts[mid]
+        if (n < mid_value)
+          hi = mid - 1
+        elsif (n > mid_value)
+          lo = mid + 1
+        else # found a match
+          while (mid+1 < @sub_readers.length and @starts[mid+1] == mid_value)
+            mid += 1 # scan to last match
          end
+          return mid
        end
-      return hi
      end
+      return hi
+    end

-
-
-
-
-
-
+    def get_norms(field)
+      synchronize do
+        bytes = @norms_cache[field]
+        if (bytes != nil)
+          return bytes # cache hit
+        end

-
-
-
-        end
-        @norms_cache[field] = bytes # update cache
-        return bytes
+        bytes = " " * @max_doc
+        @sub_readers.length.times do |i|
+          @sub_readers[i].get_norms_into(field, bytes, @starts[i])
        end
+        @norms_cache[field] = bytes # update cache
+        return bytes
      end
+    end

-
+    def get_norms_into(field, buf, offset)
+      synchronize do
        bytes = @norms_cache[field]
        if (bytes != nil) # cache hit
          buf[offset ,@max_doc] = bytes[0, @max_doc]

@@ -138,226 +138,226 @@ module Ferret
          @sub_readers[i].get_norms_into(field, buf, offset + @starts[i])
        end
      end
+    end

-
-
-
-
-
+    def do_set_norm(n, field, value)
+      @norms_cache.delete(field) # clear cache
+      i = reader_index(n) # find segment num
+      @sub_readers[i].set_norm(n-@starts[i], field, value); # dispatch
+    end

-
-
-
+    def terms()
+      return MultiTermEnum.new(@sub_readers, @starts, nil)
+    end

-
-
-
+    def terms_from(term)
+      return MultiTermEnum.new(@sub_readers, @starts, term)
+    end

-
-
-
-
-
+    def doc_freq(t)
+      total = 0 # sum freqs in segments
+      @sub_readers.each {|reader| total += reader.doc_freq(t)}
+      return total
+    end

-
-
-
+    def term_docs()
+      return MultiTermDocEnum.new(@sub_readers, @starts)
+    end

-
-
-
+    def term_positions()
+      return MultiTermDocPosEnum.new(@sub_readers, @starts)
+    end

-
-
-
+    def do_commit()
+      @sub_readers.each {|reader| reader.commit() }
+    end

-
-
-
-    end
+    def do_close()
+      synchronize do
+        @sub_readers.each {|reader| reader.close() }
      end
+    end

-
-
-
-
-
-
-    end
-      return field_set
+    # See IndexReader#get_field_names
+    def get_field_names(field_option = IndexReader::FieldOption::ALL)
+      # maintain a unique set of field names
+      field_set = Set.new
+      @sub_readers.each do |reader|
+        field_set |= reader.get_field_names(field_option)
      end
+      return field_set
    end
+  end

-
+  class MultiTermEnum < TermEnum

-
+    attr_reader :doc_freq, :term

-
-
-
-
-
-
-
-
-
-        end
-        smi = SegmentMergeInfo.new(starts[i], term_enum, reader)
-
-        if (t == nil and smi.next?) or term_enum.term
-          @queue.push(smi); # initialize queue
-        else
-          smi.close()
+    def initialize(readers, starts, t)
+      @queue = SegmentMergeQueue.new(readers.length)
+      readers.each_index do |i|
+        reader = readers[i]
+        term_enum = nil
+        if (t != nil)
+          term_enum = reader.terms_from(t)
+        else
+          term_enum = reader.terms()
        end
+        smi = SegmentMergeInfo.new(starts[i], term_enum, reader)

-        if (t
-
+        if (t == nil and smi.next?) or term_enum.term
+          @queue.push(smi); # initialize queue
+        else
+          smi.close()
        end
      end

-
-
-
-
-
-
+      if (t != nil and @queue.size() > 0)
+        next?()
+      end
+    end
+
+    def next?()
+      top = @queue.top()
+      if (top == nil)
+        @term = nil
+        return false
+      end

-
-
+      @term = top.term
+      @doc_freq = 0

-
-
-
-
-
-
-
-
-        top = @queue.top()
+      while top and @term == top.term
+        @queue.pop()
+        @doc_freq += top.term_enum.doc_freq() # increment freq
+        if (top.next?)
+          @queue.push(top) # restore queue
+        else
+          top.close() # done with a segment
        end
-
+        top = @queue.top()
      end
+      return true
+    end

-
-
-    end
+    def close()
+      @queue.close()
    end
+  end

-
-
+  class MultiTermDocEnum < TermDocEnum
+    attr_accessor :readers, :starts, :term, :base, :pointer, :current

-
-
-
-
-
+    def initialize(readers, starts)
+      @readers = readers
+      @starts = starts
+      @base = 0
+      @pointer = 0

-
-
+      @reader_term_docs = Array.new(readers.length)
+    end

-
-
-
+    def doc
+      return @base + @current.doc()
+    end

-
-
-
+    def freq
+      return @current.freq()
+    end

-
-
-
-
-
-
+    def seek(term)
+      @term = term
+      @base = 0
+      @pointer = 0
+      @current = nil
+    end

-
-
-
-
-
-
-
-
-
-
-    end
+    def next?
+      if @current and @current.next?
+        return true
+      elsif @pointer < @readers.length
+        @base = @starts[@pointer]
+        @current = term_docs(@pointer)
+        @pointer += 1
+        return next?()
+      else
+        return false
      end
+    end

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    end
+    # Optimized implementation. Unlike the Java version, this method
+    # always returns as many results as it can read.
+    def read(docs, freqs)
+      got = 0
+      last_got = 0
+      needed = docs.length
+
+      while (true)
+        while @current.nil?
+          if @pointer < @readers.length # try next segment
+            @base = @starts[@pointer]
+            @current = term_docs(@pointer)
+            @pointer += 1
+          else
+            return got
          end
-
-
-
-
-
-
-
-
-
-
-
+        end
+        got = @current.read(docs, freqs, got)
+        if (got == last_got) # none left in segment
+          @current = nil
+        else # got some
+          b = @base # adjust doc numbers
+          (last_got...got).each {|i| docs[i] += b}
+          if got == needed
+            return got
+          else
+            last_got = got
          end
        end
      end
+    end

-
-
-
-
-
-
-
-
-    def term_docs(i)
-      return nil if (@term == nil)
-      result = @reader_term_docs[i]
-      if (result == nil)
-        result = @reader_term_docs[i] = term_docs_from_reader(@readers[i])
-      end
-      result.seek(@term)
-      return result
-    end
+    # As yet unoptimized implementation.
+    def skip_to(target)
+      begin
+        return false if not next?
+      end while target > doc()
+      return true
+    end

-
-
+    def term_docs(i)
+      return nil if (@term == nil)
+      result = @reader_term_docs[i]
+      if (result == nil)
+        result = @reader_term_docs[i] = term_docs_from_reader(@readers[i])
      end
+      result.seek(@term)
+      return result
+    end

-
-
-        rtd.close()
-      end
-    end
+    def term_docs_from_reader(reader)
+      return reader.term_docs()
    end

-
-
-
+    def close()
+      @reader_term_docs.compact.each do |rtd|
+        rtd.close()
      end
+    end
+  end

-
-
-
+  class MultiTermDocPosEnum < MultiTermDocEnum
+    def initialize(r, s)
+      super(r,s)
+    end

-
-
-
+    def term_docs_from_reader(reader)
+      return reader.term_positions()
+    end

+    def next_position()
+      return @current.next_position()
    end
+
  end
 end
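The heart of the rewritten MultiReader is the @starts array: entry i holds the first document number owned by sub-reader i, with a final sentinel equal to max_doc, and reader_index binary-searches it to turn a global document number into a (sub-reader, local doc) pair. A standalone sketch of that mapping (the real code also scans forward past empty sub-readers whose starts are equal):

    # starts for three sub-readers holding 10, 5 and 20 docs
    def reader_index(starts, n)
      lo = 0
      hi = starts.length - 2 # ignore the max_doc sentinel
      while hi >= lo
        mid = (lo + hi) >> 1
        if n < starts[mid]
          hi = mid - 1
        elsif n > starts[mid]
          lo = mid + 1
        else
          return mid
        end
      end
      return hi # last entry <= n
    end

    starts = [0, 10, 15, 35]
    [0, 9, 10, 14, 34].each do |n|
      i = reader_index(starts, n)
      puts "doc #{n} -> sub-reader #{i}, local doc #{n - starts[i]}"
    end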