ferret 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +1 -1
- data/TODO +3 -0
- data/ext/dummy.exe +0 -0
- data/lib/ferret.rb +1 -1
- data/lib/ferret/analysis/token.rb +6 -0
- data/lib/ferret/analysis/tokenizers.rb +5 -5
- data/lib/ferret/document/document.rb +10 -13
- data/lib/ferret/index/compound_file_io.rb +12 -9
- data/lib/ferret/index/field_infos.rb +0 -6
- data/lib/ferret/index/index.rb +220 -102
- data/lib/ferret/index/index_reader.rb +22 -2
- data/lib/ferret/index/index_writer.rb +55 -14
- data/lib/ferret/index/multi_reader.rb +279 -279
- data/lib/ferret/index/segment_infos.rb +3 -3
- data/lib/ferret/index/segment_merger.rb +7 -6
- data/lib/ferret/index/segment_reader.rb +23 -7
- data/lib/ferret/index/segment_term_enum.rb +6 -7
- data/lib/ferret/index/term_buffer.rb +3 -5
- data/lib/ferret/index/term_doc_enum.rb +7 -2
- data/lib/ferret/index/term_infos_io.rb +15 -8
- data/lib/ferret/query_parser/query_parser.tab.rb +49 -45
- data/lib/ferret/search/boolean_query.rb +3 -4
- data/lib/ferret/search/boolean_scorer.rb +11 -11
- data/lib/ferret/search/caching_wrapper_filter.rb +1 -1
- data/lib/ferret/search/disjunction_sum_scorer.rb +9 -7
- data/lib/ferret/search/field_cache.rb +1 -2
- data/lib/ferret/search/field_sorted_hit_queue.rb +1 -1
- data/lib/ferret/search/fuzzy_term_enum.rb +64 -58
- data/lib/ferret/search/index_searcher.rb +16 -9
- data/lib/ferret/search/prefix_query.rb +7 -0
- data/lib/ferret/search/query_filter.rb +1 -1
- data/lib/ferret/search/term_scorer.rb +5 -1
- data/lib/ferret/search/top_docs.rb +12 -0
- data/lib/ferret/store/buffered_index_io.rb +5 -6
- data/lib/ferret/store/fs_store.rb +47 -33
- data/lib/ferret/store/ram_store.rb +2 -2
- data/lib/ferret/utils.rb +1 -0
- data/lib/ferret/utils/bit_vector.rb +20 -2
- data/lib/ferret/utils/thread_local.rb +28 -0
- data/lib/ferret/utils/weak_key_hash.rb +11 -2
- data/test/benchmark/tb_rw_vint.rb +1 -1
- data/test/functional/thread_safety_index_test.rb +81 -0
- data/test/functional/thread_safety_test.rb +137 -0
- data/test/test_all.rb +3 -7
- data/test/test_helper.rb +2 -1
- data/test/unit/index/tc_compound_file_io.rb +2 -2
- data/test/unit/index/tc_index.rb +128 -6
- data/test/unit/index/tc_index_reader.rb +1 -1
- data/test/unit/index/tc_segment_infos.rb +1 -1
- data/test/unit/index/th_doc.rb +1 -1
- data/test/unit/search/tc_index_searcher.rb +6 -0
- data/test/unit/store/tc_fs_store.rb +3 -3
- data/test/unit/utils/tc_bit_vector.rb +8 -0
- data/test/unit/utils/tc_thread.rb +61 -0
- data/test/unit/utils/tc_weak_key_hash.rb +2 -2
- data/test/utils/number_to_spoken.rb +132 -0
- metadata +7 -2
data/lib/ferret/index/index_reader.rb

@@ -62,8 +62,11 @@ module Ferret::Index
         FieldOption.new("TERM_VECTOR_WITH_POSITION_OFFSET")
     end

-    #
-    #
+    # To create an IndexReader use the IndexReader.open method. This method
+    # should only be used by subclasses.
+    #
+    # directory::       Directory where IndexReader files reside.
+    # segment_infos::   Used for write-l
     # close_directory:: close the directory when the index reader is closed
     def initialize(directory, segment_infos = nil,
                    close_directory = false, directory_owner = false)

@@ -81,7 +84,24 @@ module Ferret::Index
     end

     # Returns an index reader to read the index in the directory
+    #
+    # directory::       This can either be a Directory object or you can pass
+    #                   nil (RamDirectory is created) or a path (FSDirectory
+    #                   is created). If you chose the second or third options,
+    #                   you should leave close_directory as true and infos as
+    #                   nil.
+    # close_directory:: True if you want the IndexReader to close the
+    #                   directory when the IndexReader is closed. You'll want
+    #                   to set this to false if other objects are using the
+    #                   same directory object.
+    # infos::           Expert: This can be used to read an different version
+    #                   of the index but should really be left alone.
     def IndexReader.open(directory, close_directory = true, infos = nil)
+      if directory.nil?
+        directory = Ferret::Store::RAMDirectory.new
+      elsif directory.is_a?(String)
+        directory = Ferret::Store::FSDirectory.new(directory, true)
+      end
       directory.synchronize do # in- & inter-process sync
         commit_lock = directory.make_lock(IndexWriter::COMMIT_LOCK_NAME)
         commit_lock.while_locked() do
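The nil/String handling added to IndexReader.open means all three call styles below end up with a usable reader. A minimal usage sketch based only on the code above (the index path is hypothetical):

    include Ferret

    # nil: an in-memory RAMDirectory is created for you
    reader = Index::IndexReader.open(nil)

    # a String path: an FSDirectory is created; leave close_directory
    # as true so the reader disposes of it on close
    reader = Index::IndexReader.open("/tmp/my_index")

    # an existing Directory shared with other objects: pass
    # close_directory = false so closing the reader leaves it usable
    dir = Ferret::Store::FSDirectory.new("/tmp/my_index", false)
    reader = Index::IndexReader.open(dir, false)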
data/lib/ferret/index/index_writer.rb

@@ -54,6 +54,9 @@ module Index
     # NOTE:: all options are passed in a hash.
     #
     # dir::                the index directory
+    #
+    # == Options
+    #
     # analyzer::           the analyzer to use. Defaults to StandardAnalyzer.
     # create::             +true+ to create the index or overwrite the existing
     #                      one +false+ to append to the existing index

@@ -62,17 +65,23 @@ module Index
     # close_dir::          This specifies whether you would this class to close
     #                      the index directory when this class is closed. The
     #                      default is false.
-
+    # use_compound_file::  Use a compound file to store the index. This is
+    #                      slower than using multiple files but it prevents the
+    #                      too many files open error. This defaults to true.
+    def initialize(dir = nil, options = {})
       super()
-      create = options[:create]||false
-      create_if_missing = options[:create_if_missing]||false
+      create = options[:create] || false
+      create_if_missing = options[:create_if_missing] || false

-      if dir.
-      @directory =
+      if dir.nil?
+        @directory = Ferret::Store::RAMDirectory.new
+      elsif dir.is_a?(String)
+        @directory = Ferret::Store::FSDirectory.new(dir, create)
       else
         @directory = dir
       end
       @close_dir = options[:close_dir] || false
+      @use_compound_file = (options[:use_compound_file] != false) # ie default true
       @analyzer = options[:analyzer] || Ferret::Analysis::StandardAnalyzer.new
       @merge_factor = DEFAULT_MERGE_FACTOR
       @min_merge_docs = DEFAULT_MIN_MERGE_DOCS
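Putting the rewritten constructor together: dir may be nil (a RAMDirectory is created), a String path (an FSDirectory is created), or an existing Directory, and the options hash covers the rest. A sketch using only options documented above (the path is hypothetical):

    include Ferret::Index

    # in-memory index with all defaults (StandardAnalyzer, compound file on)
    writer = IndexWriter.new

    # on-disk index: create/overwrite it, keep the directory open after
    # close, and store plain per-segment files instead of a compound file
    writer = IndexWriter.new("/tmp/my_index",
                             :create => true,
                             :close_dir => false,
                             :use_compound_file => false)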
@@ -108,6 +117,8 @@ module Index
           end
         end
       end
+
+      @info_stream = nil
     end

     # Flushes all changes to an index and closes all associated files.

@@ -125,11 +136,11 @@ module Index

     # Returns the number of documents currently in this index.
     def doc_count()
-      count = 0
       synchronize() do
+        count = 0
         @segment_infos.each { |si| count += si.doc_count() }
+        return count
       end
-      return count
     end

     # Adds a document to this index, using the provided analyzer instead of the

@@ -223,7 +234,7 @@ module Index
       merger = SegmentMerger.new(@directory, merged_name, @term_index_interval)

       if (@segment_infos.size() == 1) # add existing index, if any
-        s_reader = SegmentReader.
+        s_reader = SegmentReader.get(@segment_infos[0])
         merger << s_reader
         segments_to_delete << s_reader
       end

@@ -232,7 +243,7 @@ module Index
         merger << reader
       end

-      doc_count = merger.merge
+      doc_count = merger.merge() # merge 'em

       @segment_infos.clear() # pop old infos & add new
       @segment_infos << SegmentInfo.new(merged_name, doc_count, @directory)

@@ -241,9 +252,22 @@ module Index
         @directory.make_lock(COMMIT_LOCK_NAME).while_locked(COMMIT_LOCK_TIMEOUT) do
           @segment_infos.write(@directory) # commit changes
           delete_segments(segments_to_delete)
-          return nil
         end
       end
+
+      if @use_compound_file
+        files_to_delete = merger.create_compound_file(merged_name + ".tmp")
+        @directory.synchronize() do # in- & inter-process sync
+          @directory.make_lock(COMMIT_LOCK_NAME).while_locked(COMMIT_LOCK_TIMEOUT) do
+            # make compound file visible for SegmentReaders
+            @directory.rename(merged_name + ".tmp", merged_name + ".cfs")
+            # delete now unused files of segment
+            delete_files_and_write_undeletable(files_to_delete)
+          end
+        end
+      end
+
+      optimize()
     end
   end

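The compound-file block added above (and mirrored in the second merge path below) is a small two-phase commit: the .cfs contents are written in full under a .tmp name, and only the rename, performed while holding the commit lock, makes the segment visible to SegmentReaders, so a crash mid-write never exposes a half-built file. A generic sketch of the same publish-by-rename pattern, not Ferret's actual API (all names here are illustrative):

    require 'fileutils'

    # Write the artifact under a temporary name, then publish it with a
    # rename; the rename is the commit point that readers observe.
    def publish(tmp_path, final_path, commit_lock)
      commit_lock.synchronize do  # stands in for while_locked(COMMIT_LOCK_TIMEOUT)
        FileUtils.mv(tmp_path, final_path)
      end
    end

    publish("_1.tmp", "_1.cfs", Mutex.new)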
@@ -379,11 +403,10 @@ module Index
       merged_doc_count = merger.merge()

       if (@info_stream != nil)
-        @info_stream.print(" into
+        @info_stream.print(" into #{merged_name} (#{merged_doc_count.to_s} docs)\n")
       end

       (max_segment-1).downto(min_segment) {|i| @segment_infos.delete_at(i) }
-      #@segment_infos = @segment_infos[0,min_segment] + @segment_infos[max_segment...-1]

       @segment_infos << SegmentInfo.new(merged_name, merged_doc_count, @directory)

@@ -394,10 +417,21 @@ module Index
         @directory.make_lock(COMMIT_LOCK_NAME).while_locked(COMMIT_LOCK_TIMEOUT) do
           @segment_infos.write(@directory) # commit before deleting
           delete_segments(segments_to_delete) # delete now-unused segments
-          return nil
         end
       end
-
+
+      if @use_compound_file
+        files_to_delete = merger.create_compound_file(merged_name + ".tmp")
+        @directory.synchronize() do # in- & inter-process sync
+          @directory.make_lock(COMMIT_LOCK_NAME).while_locked(COMMIT_LOCK_TIMEOUT) do
+            # make compound file visible for SegmentReaders
+            @directory.rename(merged_name + ".tmp", merged_name + ".cfs")
+            # delete now unused files of segment
+            delete_files_and_write_undeletable(files_to_delete)
+          end
+        end
+      end
+
     end

     # Some operating systems (e.g. Windows) don't permit a file to be

@@ -440,6 +474,13 @@ module Index

     end

+    def delete_files_and_write_undeletable(files)
+      deletable = []
+      try_to_delete_files(read_deleteable_files(), deletable) # try to delete deleteable
+      try_to_delete_files(files, deletable) # try to delete our files
+      write_deleteable_files(deletable) # note files we can't delete
+    end
+
     def delete_files(file_names, dir)
       file_names.each do |file_name|
         dir.delete(file_name)
data/lib/ferret/index/multi_reader.rb

@@ -1,133 +1,133 @@
-module Ferret
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+module Ferret::Index
+  # An IndexReader which reads multiple indexes, appending their content.
+  class MultiReader < IndexReader
+    attr_reader :max_doc
+
+    # Construct a MultiReader aggregating the named set of (sub)readers.
+    # Directory locking for delete, undeleteAll, and set_norm operations is
+    # left to the subreaders.
+    #
+    # Note that all subreaders are closed if this Multireader is closed.
+    # sub_readers:: set of (sub)readers
+    # raises:: IOException
+    def initialize(sub_readers, directory = nil, sis = nil, close_dir = false)
+      if (directory)
+        super(directory, sis, close_dir)
+      else
+        super(sub_readers.length == 0 ? nil : sub_readers[0].directory())
+      end
+
+      @max_doc = 0
+      @num_docs = -1
+      @has_deletions = false
+
+      @sub_readers = sub_readers
+      @starts = Array.new(@sub_readers.length + 1) # build starts array
+      @sub_readers.each_with_index do |sub_reader, i|
+        @starts[i] = @max_doc
+        @max_doc += sub_reader.max_doc # compute max_docs
+
+        if @sub_readers[i].has_deletions?
+          @has_deletions = true
         end
-
-      @max_doc = 0
-      @num_docs = -1
-      @has_deletions = false
-
-      @sub_readers = sub_readers
-      @starts = Array.new(@sub_readers.length + 1) # build starts array
-      @sub_readers.each_with_index do |sub_reader, i|
-        @starts[i] = @max_doc
-        @max_doc += sub_reader.max_doc # compute maxDocs
-
-        if @sub_readers[i].has_deletions?
-          @has_deletions = true
-        end
-      end
-      @starts[@sub_readers.length] = @max_doc
-      @norms_cache = {}
       end
+      @starts[@sub_readers.length] = @max_doc
+      @norms_cache = {}
+    end


-
-
-
-
-
-
-
-
-
+    # Return an array of term frequency vectors for the specified document. The
+    # array contains a vector for each vectorized field in the document. Each
+    # vector vector contains term numbers and frequencies for all terms in a
+    # given vectorized field. If no such fields existed, the method returns
+    # nil.
+    def get_term_vectors(n)
+      i = reader_index(n) # find segment num
+      return @sub_readers[i].get_term_vectors(n - @starts[i]); # dispatch to segment
+    end

-
-
-
-
+    def get_term_vector(n, field)
+      i = reader_index(n) # find segment num
+      return @sub_readers[i].get_term_vector(n - @starts[i], field)
+    end

-
-
-
-
-
-
-      end
-      return @num_docs
+    def num_docs()
+      synchronize do
+        if (@num_docs == -1) # check cache
+          n = 0 # cache miss -= 1recompute
+          @sub_readers.each {|reader| n += reader.num_docs()}
+          @num_docs = n
         end
+        return @num_docs
       end
+    end

-
-
-
-
+    def get_document(n)
+      i = reader_index(n) # find segment num
+      return @sub_readers[i].get_document(n - @starts[i]) # dispatch to segment reader
+    end

-
-
-
-
+    def deleted?(n)
+      i = reader_index(n) # find segment num
+      return @sub_readers[i].deleted?(n - @starts[i]) # dispatch to segment reader
+    end

-
-
-
+    def has_deletions?()
+      return @has_deletions
+    end

-
-
-
-
-
-
+    def do_delete(n)
+      @num_docs = -1 # invalidate cache
+      i = reader_index(n) # find segment num
+      @sub_readers[i].delete(n - @starts[i]) # dispatch to segment reader
+      @has_deletions = true
+    end

-
-
-
-
-
+    def do_undelete_all()
+      @num_docs = -1 # invalidate cache
+      @sub_readers.each {|reader| reader.undelete_all() }
+      @has_deletions = false
+    end

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-      end
-      return mid
+    def reader_index(n) # find reader for doc n:
+      lo = 0 # search @starts array
+      hi = @sub_readers.length - 1 # for first element less
+
+      while (hi >= lo)
+        mid = (lo + hi) >> 1
+        mid_value = @starts[mid]
+        if (n < mid_value)
+          hi = mid - 1
+        elsif (n > mid_value)
+          lo = mid + 1
+        else # found a match
+          while (mid+1 < @sub_readers.length and @starts[mid+1] == mid_value)
+            mid += 1 # scan to last match
           end
+          return mid
         end
-      return hi
       end
+      return hi
+    end

-
-
-
-
-
-
+    def get_norms(field)
+      synchronize do
+        bytes = @norms_cache[field]
+        if (bytes != nil)
+          return bytes # cache hit
+        end

-
-
-
-      end
-      @norms_cache[field] = bytes # update cache
-      return bytes
+        bytes = " " * @max_doc
+        @sub_readers.length.times do |i|
+          @sub_readers[i].get_norms_into(field, bytes, @starts[i])
         end
+        @norms_cache[field] = bytes # update cache
+        return bytes
       end
+    end

-
+    def get_norms_into(field, buf, offset)
+      synchronize do
         bytes = @norms_cache[field]
         if (bytes != nil) # cache hit
           buf[offset ,@max_doc] = bytes[0, @max_doc]
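reader_index above is a binary search over the cumulative @starts array; the owning sub-reader then receives the segment-local number n - @starts[i]. A self-contained sketch of the same mapping with invented document counts:

    # Sub-readers holding 10, 15 and 5 docs give starts = [0, 10, 25] plus
    # the total, 30, in the final slot.
    STARTS = [0, 10, 25, 30]

    # Find the last start <= n, i.e. the sub-reader owning global doc n.
    # (The real method also scans forward past empty readers whose starts
    # are equal.)
    def reader_index(n)
      lo, hi = 0, STARTS.length - 2   # last slot is the total, not a reader
      while hi >= lo
        mid = (lo + hi) >> 1
        if n < STARTS[mid]
          hi = mid - 1
        elsif n > STARTS[mid]
          lo = mid + 1
        else
          return mid
        end
      end
      return hi
    end

    i = reader_index(12)        # => 1
    local = 12 - STARTS[i]      # => 2; global doc 12 is doc 2 of reader 1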
@@ -138,226 +138,226 @@ module Ferret
           @sub_readers[i].get_norms_into(field, buf, offset + @starts[i])
         end
       end
+    end

-
-
-
-
-
+    def do_set_norm(n, field, value)
+      @norms_cache.delete(field) # clear cache
+      i = reader_index(n) # find segment num
+      @sub_readers[i].set_norm(n-@starts[i], field, value); # dispatch
+    end

-
-
-
+    def terms()
+      return MultiTermEnum.new(@sub_readers, @starts, nil)
+    end

-
-
-
+    def terms_from(term)
+      return MultiTermEnum.new(@sub_readers, @starts, term)
+    end

-
-
-
-
-
+    def doc_freq(t)
+      total = 0 # sum freqs in segments
+      @sub_readers.each {|reader| total += reader.doc_freq(t)}
+      return total
+    end

-
-
-
+    def term_docs()
+      return MultiTermDocEnum.new(@sub_readers, @starts)
+    end

-
-
-
+    def term_positions()
+      return MultiTermDocPosEnum.new(@sub_readers, @starts)
+    end

-
-
-
+    def do_commit()
+      @sub_readers.each {|reader| reader.commit() }
+    end

-
-
-
-    end
+    def do_close()
+      synchronize do
+        @sub_readers.each {|reader| reader.close() }
       end
+    end

-
-
-
-
-
-
-      end
-      return field_set
+    # See IndexReader#get_field_names
+    def get_field_names(field_option = IndexReader::FieldOption::ALL)
+      # maintain a unique set of field names
+      field_set = Set.new
+      @sub_readers.each do |reader|
+        field_set |= reader.get_field_names(field_option)
       end
+      return field_set
     end
+  end

-
+  class MultiTermEnum < TermEnum

-
+    attr_reader :doc_freq, :term

-
-
-
-
-
-
-
-
-
-      end
-      smi = SegmentMergeInfo.new(starts[i], term_enum, reader)
-
-      if (t == nil and smi.next?) or term_enum.term
-        @queue.push(smi); # initialize queue
-      else
-        smi.close()
-      end
+    def initialize(readers, starts, t)
+      @queue = SegmentMergeQueue.new(readers.length)
+      readers.each_index do |i|
+        reader = readers[i]
+        term_enum = nil
+        if (t != nil)
+          term_enum = reader.terms_from(t)
+        else
+          term_enum = reader.terms()
         end
+        smi = SegmentMergeInfo.new(starts[i], term_enum, reader)

-      if (t
-
+        if (t == nil and smi.next?) or term_enum.term
+          @queue.push(smi); # initialize queue
+        else
+          smi.close()
         end
       end

-
-
-
-
-
-
+      if (t != nil and @queue.size() > 0)
+        next?()
+      end
+    end
+
+    def next?()
+      top = @queue.top()
+      if (top == nil)
+        @term = nil
+        return false
+      end

-
-
+      @term = top.term
+      @doc_freq = 0

-
-
-
-
-
-
-
-      end
-      top = @queue.top()
+      while top and @term == top.term
+        @queue.pop()
+        @doc_freq += top.term_enum.doc_freq() # increment freq
+        if (top.next?)
+          @queue.push(top) # restore queue
+        else
+          top.close() # done with a segment
         end
-
+        top = @queue.top()
       end
+      return true
+    end

-
-
-    end
+    def close()
+      @queue.close()
     end
+  end

-
-
+  class MultiTermDocEnum < TermDocEnum
+    attr_accessor :readers, :starts, :term, :base, :pointer, :current

-
-
-
-
-
+    def initialize(readers, starts)
+      @readers = readers
+      @starts = starts
+      @base = 0
+      @pointer = 0

-
-
+      @reader_term_docs = Array.new(readers.length)
+    end

-
-
-
+    def doc
+      return @base + @current.doc()
+    end

-
-
-
+    def freq
+      return @current.freq()
+    end

-
-
-
-
-
-
+    def seek(term)
+      @term = term
+      @base = 0
+      @pointer = 0
+      @current = nil
+    end

-
-
-
-
-
-
-
-
-
-
-      end
+    def next?
+      if @current and @current.next?
+        return true
+      elsif @pointer < @readers.length
+        @base = @starts[@pointer]
+        @current = term_docs(@pointer)
+        @pointer += 1
+        return next?()
+      else
+        return false
       end
+    end

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    end
+    # Optimized implementation. Unlike the Java version, this method
+    # always returns as many results as it can read.
+    def read(docs, freqs)
+      got = 0
+      last_got = 0
+      needed = docs.length
+
+      while (true)
+        while @current.nil?
+          if @pointer < @readers.length # try next segment
+            @base = @starts[@pointer]
+            @current = term_docs(@pointer)
+            @pointer += 1
+          else
+            return got
           end
-
-
-
-
-
-
-
-
-
-
-
+        end
+        got = @current.read(docs, freqs, got)
+        if (got == last_got) # none left in segment
+          @current = nil
+        else # got some
+          b = @base # adjust doc numbers
+          (last_got...got).each {|i| docs[i] += b}
+          if got == needed
+            return got
+          else
+            last_got = got
           end
         end
       end
+    end

-
-
-
-
-
-
-
-
-    def term_docs(i)
-      return nil if (@term == nil)
-      result = @reader_term_docs[i]
-      if (result == nil)
-        result = @reader_term_docs[i] = term_docs_from_reader(@readers[i])
-      end
-      result.seek(@term)
-      return result
-    end
+    # As yet unoptimized implementation.
+    def skip_to(target)
+      begin
+        return false if not next?
+      end while target > doc()
+      return true
+    end

-
-
+    def term_docs(i)
+      return nil if (@term == nil)
+      result = @reader_term_docs[i]
+      if (result == nil)
+        result = @reader_term_docs[i] = term_docs_from_reader(@readers[i])
       end
+      result.seek(@term)
+      return result
+    end

-
-
-      rtd.close()
-    end
-  end
+    def term_docs_from_reader(reader)
+      return reader.term_docs()
     end

-
-
-
+    def close()
+      @reader_term_docs.compact.each do |rtd|
+        rtd.close()
       end
+    end
+  end

-
-
-
+  class MultiTermDocPosEnum < MultiTermDocEnum
+    def initialize(r, s)
+      super(r,s)
+    end

-
-
-
+    def term_docs_from_reader(reader)
+      return reader.term_positions()
+    end

+    def next_position()
+      return @current.next_position()
     end
+
   end
 end