ferret 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +1 -1
- data/TODO +3 -0
- data/ext/dummy.exe +0 -0
- data/lib/ferret.rb +1 -1
- data/lib/ferret/analysis/token.rb +6 -0
- data/lib/ferret/analysis/tokenizers.rb +5 -5
- data/lib/ferret/document/document.rb +10 -13
- data/lib/ferret/index/compound_file_io.rb +12 -9
- data/lib/ferret/index/field_infos.rb +0 -6
- data/lib/ferret/index/index.rb +220 -102
- data/lib/ferret/index/index_reader.rb +22 -2
- data/lib/ferret/index/index_writer.rb +55 -14
- data/lib/ferret/index/multi_reader.rb +279 -279
- data/lib/ferret/index/segment_infos.rb +3 -3
- data/lib/ferret/index/segment_merger.rb +7 -6
- data/lib/ferret/index/segment_reader.rb +23 -7
- data/lib/ferret/index/segment_term_enum.rb +6 -7
- data/lib/ferret/index/term_buffer.rb +3 -5
- data/lib/ferret/index/term_doc_enum.rb +7 -2
- data/lib/ferret/index/term_infos_io.rb +15 -8
- data/lib/ferret/query_parser/query_parser.tab.rb +49 -45
- data/lib/ferret/search/boolean_query.rb +3 -4
- data/lib/ferret/search/boolean_scorer.rb +11 -11
- data/lib/ferret/search/caching_wrapper_filter.rb +1 -1
- data/lib/ferret/search/disjunction_sum_scorer.rb +9 -7
- data/lib/ferret/search/field_cache.rb +1 -2
- data/lib/ferret/search/field_sorted_hit_queue.rb +1 -1
- data/lib/ferret/search/fuzzy_term_enum.rb +64 -58
- data/lib/ferret/search/index_searcher.rb +16 -9
- data/lib/ferret/search/prefix_query.rb +7 -0
- data/lib/ferret/search/query_filter.rb +1 -1
- data/lib/ferret/search/term_scorer.rb +5 -1
- data/lib/ferret/search/top_docs.rb +12 -0
- data/lib/ferret/store/buffered_index_io.rb +5 -6
- data/lib/ferret/store/fs_store.rb +47 -33
- data/lib/ferret/store/ram_store.rb +2 -2
- data/lib/ferret/utils.rb +1 -0
- data/lib/ferret/utils/bit_vector.rb +20 -2
- data/lib/ferret/utils/thread_local.rb +28 -0
- data/lib/ferret/utils/weak_key_hash.rb +11 -2
- data/test/benchmark/tb_rw_vint.rb +1 -1
- data/test/functional/thread_safety_index_test.rb +81 -0
- data/test/functional/thread_safety_test.rb +137 -0
- data/test/test_all.rb +3 -7
- data/test/test_helper.rb +2 -1
- data/test/unit/index/tc_compound_file_io.rb +2 -2
- data/test/unit/index/tc_index.rb +128 -6
- data/test/unit/index/tc_index_reader.rb +1 -1
- data/test/unit/index/tc_segment_infos.rb +1 -1
- data/test/unit/index/th_doc.rb +1 -1
- data/test/unit/search/tc_index_searcher.rb +6 -0
- data/test/unit/store/tc_fs_store.rb +3 -3
- data/test/unit/utils/tc_bit_vector.rb +8 -0
- data/test/unit/utils/tc_thread.rb +61 -0
- data/test/unit/utils/tc_weak_key_hash.rb +2 -2
- data/test/utils/number_to_spoken.rb +132 -0
- metadata +7 -2
data/Rakefile
CHANGED
@@ -196,7 +196,7 @@ end
|
|
196
196
|
|
197
197
|
desc "Make a new release"
|
198
198
|
task :prerelease => [:clobber, :all_tests, :parsers]
|
199
|
-
|
199
|
+
task :package => [:prerelease]
|
200
200
|
task :tag => [:prerelease]
|
201
201
|
task :update_version => [:prerelease]
|
202
202
|
task :release => [:tag, :update_version, :package] do
|
data/TODO
CHANGED
@@ -5,8 +5,11 @@ Send suggestions for this list to mailto:dbalmain@gmail.com
|
|
5
5
|
=== To Do
|
6
6
|
|
7
7
|
* Add the ability to persist an in memory index to Ferret::Index::Index
|
8
|
+
* Make a dll for people on Windows
|
8
9
|
|
9
10
|
=== Done
|
10
11
|
|
11
12
|
* Add UTF-8 support
|
12
13
|
* Multi Field Query
|
14
|
+
* Test threading
|
15
|
+
* Compile a proper dummy executable
|
data/ext/dummy.exe
CHANGED
Binary file
|
data/lib/ferret.rb
CHANGED
@@ -35,6 +35,12 @@ module Ferret::Analysis
|
|
35
35
|
@position_increment = pos_inc
|
36
36
|
end
|
37
37
|
|
38
|
+
def eql?(o)
|
39
|
+
return (o.instance_of?(Token) and @start_offset == o.start_offset and
|
40
|
+
@end_offset == o.end_offset and @term_text = o.term_text)
|
41
|
+
end
|
42
|
+
alias :== :eql?
|
43
|
+
|
38
44
|
# Tokens are sorted by the position in the text at which they occur, ie
|
39
45
|
# the start_offset. If two tokens have the same start offset, (see
|
40
46
|
# position_increment=) then, they are sorted by the end_offset and then
|
@@ -26,7 +26,7 @@ module Ferret::Analysis
|
|
26
26
|
#
|
27
27
|
# class LetterTokenizer < RegExpTokenizer
|
28
28
|
# def token_re()
|
29
|
-
# /[
|
29
|
+
# /[[:alpha:]]+/
|
30
30
|
# end
|
31
31
|
# end
|
32
32
|
class RegExpTokenizer < Tokenizer
|
@@ -63,7 +63,7 @@ module Ferret::Analysis
|
|
63
63
|
protected
|
64
64
|
# returns the regular expression used to find the next token
|
65
65
|
def token_re
|
66
|
-
/[
|
66
|
+
/[[:alpha:]]+/
|
67
67
|
end
|
68
68
|
|
69
69
|
# Called on each token to normalize it before it is added to the
|
@@ -75,13 +75,13 @@ module Ferret::Analysis
|
|
75
75
|
|
76
76
|
# A LetterTokenizer is a tokenizer that divides text at non-letters.
|
77
77
|
# That's to say, it defines tokens as maximal strings of adjacent letters,
|
78
|
-
# as defined by the regular expression _/[
|
78
|
+
# as defined by the regular expression _/[[:alpha:]]+/_.
|
79
79
|
class LetterTokenizer < RegExpTokenizer
|
80
80
|
protected
|
81
81
|
# Collects only characters which satisfy the regular expression
|
82
|
-
# _/[
|
82
|
+
# _/[[:alpha:]]+/_.
|
83
83
|
def token_re()
|
84
|
-
/[
|
84
|
+
/[[:alpha:]]+/
|
85
85
|
end
|
86
86
|
end
|
87
87
|
|
@@ -69,13 +69,13 @@ module Ferret::Document
|
|
69
69
|
# document has to be deleted from an index and a new changed version of
|
70
70
|
# that document has to be added.
|
71
71
|
def add_field(field)
|
72
|
-
(@fields[field.name] ||= []) << field
|
72
|
+
(@fields[field.name.to_s] ||= []) << field
|
73
73
|
end
|
74
74
|
alias :<< :add_field
|
75
75
|
|
76
76
|
# Removes the first field of this name if it exists.
|
77
77
|
def remove_field(name)
|
78
|
-
@fields[name].delete_at(0)
|
78
|
+
@fields[name.to_s].delete_at(0)
|
79
79
|
end
|
80
80
|
|
81
81
|
# Removes all fields with the given name from the document.
|
@@ -89,7 +89,7 @@ module Ferret::Document
|
|
89
89
|
# this, a document has to be deleted from an index and a new changed
|
90
90
|
# version of that document has to be added.
|
91
91
|
def remove_fields(name)
|
92
|
-
@fields.delete(name)
|
92
|
+
@fields.delete(name.to_s)
|
93
93
|
end
|
94
94
|
|
95
95
|
# Returns the first field with the given name.
|
@@ -98,7 +98,7 @@ module Ferret::Document
|
|
98
98
|
# name:: the name of the field
|
99
99
|
# Return:: the first _Field_ with that name, or nil if there is none
|
100
100
|
def field(name)
|
101
|
-
@fields[name] ? @fields[name][0] : nil
|
101
|
+
@fields[name.to_s] ? @fields[name.to_s][0] : nil
|
102
102
|
end
|
103
103
|
|
104
104
|
# Returns an array of all fields with the given name.
|
@@ -107,7 +107,7 @@ module Ferret::Document
|
|
107
107
|
# name:: the name of the field
|
108
108
|
# Return:: a _Field_ array
|
109
109
|
def fields(name)
|
110
|
-
@fields[name]
|
110
|
+
@fields[name.to_s]
|
111
111
|
end
|
112
112
|
|
113
113
|
# Returns an array of values of the field specified as the method
|
@@ -116,8 +116,8 @@ module Ferret::Document
|
|
116
116
|
# name:: the name of the field
|
117
117
|
# Return:: a _String_ of field values
|
118
118
|
def values(name)
|
119
|
-
return nil if @fields[name].nil?
|
120
|
-
@fields[name].map {|f| f.data if not f.binary? }.join(" ")
|
119
|
+
return nil if @fields[name.to_s].nil?
|
120
|
+
@fields[name.to_s].map {|f| f.data if not f.binary? }.join(" ")
|
121
121
|
end
|
122
122
|
alias :[] :values
|
123
123
|
|
@@ -125,7 +125,7 @@ module Ferret::Document
|
|
125
125
|
# field of that name then it will set the data in the first field of that
|
126
126
|
# name.
|
127
127
|
def []=(field_name, data)
|
128
|
-
field = field(field_name)
|
128
|
+
field = field(field_name.to_s)
|
129
129
|
raise ArgumentError, "Field does not exist" unless field
|
130
130
|
field.data = data
|
131
131
|
end
|
@@ -137,16 +137,13 @@ module Ferret::Document
|
|
137
137
|
# Return:: an _Array_ of the binary field values
|
138
138
|
def binaries(name)
|
139
139
|
binaries = []
|
140
|
-
@fields[name].each {|f| binaries << f.data if f.binary? }
|
140
|
+
@fields[name.to_s].each {|f| binaries << f.data if f.binary? }
|
141
141
|
return binaries
|
142
142
|
end
|
143
143
|
|
144
144
|
# Prints the fields of a document for human consumption.#/
|
145
145
|
def to_s()
|
146
|
-
|
147
|
-
@fields.each_key { |name| field_str += name + " " }
|
148
|
-
field_str[-1] = ">"
|
149
|
-
return "Document<" + field_str
|
146
|
+
return "Document<#{@fields.keys.join(" ")}>"
|
150
147
|
end
|
151
148
|
end
|
152
149
|
end
|
@@ -92,7 +92,7 @@ module Ferret::Index
|
|
92
92
|
end
|
93
93
|
|
94
94
|
# Returns true iff a file with the given name exists.
|
95
|
-
def
|
95
|
+
def exists?(name)
|
96
96
|
return @entries.key?(name)
|
97
97
|
end
|
98
98
|
|
@@ -113,7 +113,7 @@ module Ferret::Index
|
|
113
113
|
def rename(from, to) raise(UnsupportedOperationError) end
|
114
114
|
|
115
115
|
# Returns the length of a file in the directory.
|
116
|
-
def
|
116
|
+
def length(name)
|
117
117
|
e = @entries[name]
|
118
118
|
if (e == nil): raise(IOError, "File " + name + " does not exist") end
|
119
119
|
return e.length
|
@@ -188,6 +188,9 @@ module Ferret::Index
|
|
188
188
|
# data section, and a UTF String with that file's extension.
|
189
189
|
class CompoundFileWriter
|
190
190
|
|
191
|
+
class StateError < Exception
|
192
|
+
end
|
193
|
+
|
191
194
|
attr_reader :directory, :file_name
|
192
195
|
|
193
196
|
# Create the compound stream in the specified file. The file name is the
|
@@ -203,16 +206,16 @@ module Ferret::Index
|
|
203
206
|
# Add a source stream. _file_name_ is the string by which the
|
204
207
|
# sub-stream will be known in the compound stream.
|
205
208
|
#
|
206
|
-
# Throws::
|
207
|
-
# Throws::
|
209
|
+
# Throws:: StateError if this writer is closed
|
210
|
+
# Throws:: ArgumentError if a file with the same name
|
208
211
|
# has been added already
|
209
212
|
def add_file(file_name)
|
210
213
|
if @merged
|
211
|
-
raise(
|
214
|
+
raise(StateError, "Can't add extensions after merge has been called")
|
212
215
|
end
|
213
216
|
|
214
217
|
if not @ids.add?(file_name)
|
215
|
-
raise(
|
218
|
+
raise(ArgumentError, "File #{file_name} already added")
|
216
219
|
end
|
217
220
|
|
218
221
|
entry = FileEntry.new(file_name)
|
@@ -224,16 +227,16 @@ module Ferret::Index
|
|
224
227
|
# compound stream. After successful merge, the source files
|
225
228
|
# are deleted.
|
226
229
|
#
|
227
|
-
# Throws::
|
230
|
+
# Throws:: StateException if close() had been called before or
|
228
231
|
# if no file has been added to this object
|
229
232
|
def close()
|
230
233
|
|
231
234
|
if @merged
|
232
|
-
raise(
|
235
|
+
raise(StateException, "Merge already performed")
|
233
236
|
end
|
234
237
|
|
235
238
|
if @file_entries.empty?
|
236
|
-
raise(
|
239
|
+
raise(StateException, "No entries to merge have been defined")
|
237
240
|
end
|
238
241
|
|
239
242
|
@merged = true
|
@@ -27,12 +27,6 @@ module Ferret
|
|
27
27
|
end
|
28
28
|
end
|
29
29
|
|
30
|
-
# Returns the number of fields that have been added to this field infos
|
31
|
-
# object.
|
32
|
-
def size
|
33
|
-
return @fi_array.size
|
34
|
-
end
|
35
|
-
|
36
30
|
# Automatically adds all of the fields from the document if they haven't
|
37
31
|
# been added already. Or it will update the values.
|
38
32
|
def add_doc_fields(doc)
|
data/lib/ferret/index/index.rb
CHANGED
@@ -1,7 +1,11 @@
|
|
1
|
+
require 'monitor'
|
2
|
+
|
1
3
|
module Ferret::Index
|
2
4
|
# This is a simplified interface to the index. See the TUTORIAL for more
|
3
5
|
# information on how to use this class.
|
4
6
|
class Index
|
7
|
+
include MonitorMixin
|
8
|
+
|
5
9
|
include Ferret::Store
|
6
10
|
include Ferret::Search
|
7
11
|
include Ferret::Document
|
@@ -77,9 +81,10 @@ module Ferret::Index
|
|
77
81
|
# :default_slop => 2)
|
78
82
|
#
|
79
83
|
def initialize(options = {})
|
84
|
+
super()
|
85
|
+
options[:create_if_missing] = true if options[:create_if_missing].nil?
|
80
86
|
if options[:path]
|
81
|
-
options[:
|
82
|
-
@dir = FSDirectory.new(options[:path], true)
|
87
|
+
@dir = FSDirectory.new(options[:path], options[:create])
|
83
88
|
options[:close_dir] = true
|
84
89
|
elsif options[:dir]
|
85
90
|
@dir = options[:dir]
|
@@ -88,29 +93,34 @@ module Ferret::Index
|
|
88
93
|
@dir = RAMDirectory.new
|
89
94
|
end
|
90
95
|
|
91
|
-
@
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
96
|
+
@dir.synchronize do
|
97
|
+
@options = options
|
98
|
+
@writer = IndexWriter.new(@dir, options)
|
99
|
+
options[:analyzer] = @analyzer = @writer.analyzer
|
100
|
+
@has_writes = false
|
101
|
+
@reader = nil
|
102
|
+
@options.delete(:create) # only want to create the first time if at all
|
103
|
+
@close_dir = @options.delete(:close_dir) || false # we'll hold this here
|
104
|
+
@default_search_field = (@options[:default_search_field] || \
|
105
|
+
@options[:default_field] || "*")
|
106
|
+
@default_field = @options[:default_field] || ""
|
107
|
+
@open = true
|
108
|
+
@qp = nil
|
109
|
+
end
|
102
110
|
end
|
103
111
|
|
104
112
|
# Closes this index by closing its associated reader and writer objects.
|
105
113
|
def close
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
114
|
+
@dir.synchronize do
|
115
|
+
if not @open
|
116
|
+
raise "tried to close an already closed directory"
|
117
|
+
end
|
118
|
+
@reader.close() if @reader
|
119
|
+
@writer.close() if @writer
|
120
|
+
@dir.close()
|
112
121
|
|
113
|
-
|
122
|
+
@open = false
|
123
|
+
end
|
114
124
|
end
|
115
125
|
|
116
126
|
# Get the reader for this index.
|
@@ -133,6 +143,7 @@ module Ferret::Index
|
|
133
143
|
ensure_writer_open()
|
134
144
|
return @writer
|
135
145
|
end
|
146
|
+
protected :reader, :writer, :searcher
|
136
147
|
|
137
148
|
# Adds a document to this index, using the provided analyzer instead of
|
138
149
|
# the local analyzer if provided. If the document contains more than
|
@@ -147,27 +158,28 @@ module Ferret::Index
|
|
147
158
|
# index << "This is a new document to be indexed"
|
148
159
|
# index << ["And here", "is another", "new document", "to be indexed"]
|
149
160
|
#
|
150
|
-
# But these are pretty simple documents. If this is all you want to index
|
151
|
-
# could probably just use SimpleSearch. So let's give our documents
|
161
|
+
# But these are pretty simple documents. If this is all you want to index
|
162
|
+
# you could probably just use SimpleSearch. So let's give our documents
|
163
|
+
# some fields;
|
152
164
|
#
|
153
165
|
# index << {:title => "Programming Ruby", :content => "blah blah blah"}
|
154
166
|
# index << {:title => "Programming Ruby", :content => "yada yada yada"}
|
155
167
|
#
|
156
|
-
# Or if you are indexing data stored in a database, you'll probably want
|
157
|
-
# store the id;
|
168
|
+
# Or if you are indexing data stored in a database, you'll probably want
|
169
|
+
# to store the id;
|
158
170
|
#
|
159
171
|
# index << {:id => row.id, :title => row.title, :date => row.date}
|
160
172
|
#
|
161
|
-
# The methods above while store all of the input data as well tokenizing
|
162
|
-
# indexing it. Sometimes we won't want to tokenize (divide the string
|
163
|
-
# tokens) the data. For example, we might want to leave the title as
|
164
|
-
# string and only allow searchs for that complete string.
|
165
|
-
# want to store the data as it's already stored in the
|
166
|
-
# waste to store it in the index. Or perhaps we are
|
167
|
-
# using Ferret to store all of our data, in
|
168
|
-
# index it. For example, if we are storing
|
169
|
-
# index them. All of this can be
|
170
|
-
# eg;
|
173
|
+
# The methods above will store all of the input data as well as tokenizing
|
174
|
+
# and indexing it. Sometimes we won't want to tokenize (divide the string
|
175
|
+
# into tokens) the data. For example, we might want to leave the title as
|
176
|
+
# a complete string and only allow searches for that complete string.
|
177
|
+
# Sometimes we won't want to store the data as it's already stored in the
|
178
|
+
# database so it'll be a waste to store it in the index. Or perhaps we are
|
179
|
+
# doing without a database and using Ferret to store all of our data, in
|
180
|
+
# which case we might not want to index it. For example, if we are storing
|
181
|
+
# images in the index, we won't want to index them. All of this can be
|
182
|
+
# done using Ferret's Ferret::Document module. eg;
|
171
183
|
#
|
172
184
|
# include Ferret::Document
|
173
185
|
# doc = Document.new
|
@@ -177,35 +189,37 @@ module Ferret::Index
|
|
177
189
|
# doc << Field.new("image", row.image, Field::Store::YES, Field::Index::NO)
|
178
190
|
# index << doc
|
179
191
|
#
|
180
|
-
# You can also compress the data that you are storing or store term
|
181
|
-
# the data. Read more about this in Ferret::Document::Field.
|
192
|
+
# You can also compress the data that you are storing or store term
|
193
|
+
# vectors with the data. Read more about this in Ferret::Document::Field.
|
182
194
|
def add_document(doc, analyzer = nil)
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
elsif doc.is_a?(Array)
|
190
|
-
fdoc = Document.new
|
191
|
-
doc.each() do |field|
|
192
|
-
fdoc << Field.new(@default_field, field,
|
195
|
+
@dir.synchronize do
|
196
|
+
ensure_writer_open()
|
197
|
+
fdoc = nil
|
198
|
+
if doc.is_a?(String)
|
199
|
+
fdoc = Document.new
|
200
|
+
fdoc << Field.new(@default_field, doc,
|
193
201
|
Field::Store::YES, Field::Index::TOKENIZED)
|
202
|
+
elsif doc.is_a?(Array)
|
203
|
+
fdoc = Document.new
|
204
|
+
doc.each() do |field|
|
205
|
+
fdoc << Field.new(@default_field, field,
|
206
|
+
Field::Store::YES, Field::Index::TOKENIZED)
|
207
|
+
end
|
208
|
+
elsif doc.is_a?(Hash)
|
209
|
+
fdoc = Document.new
|
210
|
+
doc.each_pair() do |field, text|
|
211
|
+
fdoc << Field.new(field.to_s, text.to_s,
|
212
|
+
Field::Store::YES, Field::Index::TOKENIZED)
|
213
|
+
end
|
214
|
+
elsif doc.is_a?(Document)
|
215
|
+
fdoc = doc
|
216
|
+
else
|
217
|
+
raise ArgumentError, "Unknown document type #{doc.class}"
|
194
218
|
end
|
195
|
-
|
196
|
-
fdoc = Document.new
|
197
|
-
doc.each_pair() do |field, text|
|
198
|
-
fdoc << Field.new(field.to_s, text.to_s,
|
199
|
-
Field::Store::YES, Field::Index::TOKENIZED)
|
200
|
-
end
|
201
|
-
elsif doc.is_a?(Document)
|
202
|
-
fdoc = doc
|
203
|
-
else
|
204
|
-
raise ArgumentError, "Unknown document type #{doc.class}"
|
205
|
-
end
|
206
|
-
@has_writes = true
|
219
|
+
@has_writes = true
|
207
220
|
|
208
|
-
|
221
|
+
@writer.add_document(fdoc, analyzer || @writer.analyzer)
|
222
|
+
end
|
209
223
|
end
|
210
224
|
alias :<< :add_document
|
211
225
|
|
@@ -213,24 +227,16 @@ module Ferret::Index
|
|
213
227
|
# pass to this method. You can also pass a hash with one or more of the
|
214
228
|
# following; {filter, num_docs, first_doc, sort}
|
215
229
|
#
|
216
|
-
# query::
|
217
|
-
# filter::
|
218
|
-
# first_doc::
|
219
|
-
#
|
220
|
-
# num_docs::
|
221
|
-
# sort::
|
230
|
+
# query:: The query to run on the index
|
231
|
+
# filter:: Filters docs from the search result
|
232
|
+
# first_doc:: The index in the results of the first doc retrieved.
|
233
|
+
# Default is 0
|
234
|
+
# num_docs:: The number of results returned. Default is 10
|
235
|
+
# sort:: An array of SortFields describing how to sort the results.
|
222
236
|
def search(query, options = {})
|
223
|
-
|
224
|
-
|
225
|
-
if @qp.nil?
|
226
|
-
@qp = Ferret::QueryParser.new(@default_search_field, @options)
|
227
|
-
end
|
228
|
-
# we need to set this ever time, in case a new field has been added
|
229
|
-
@qp.fields = @reader.get_field_names.to_a
|
230
|
-
query = @qp.parse(query)
|
237
|
+
@dir.synchronize do
|
238
|
+
return do_search(query, options)
|
231
239
|
end
|
232
|
-
|
233
|
-
return @searcher.search(query, options)
|
234
240
|
end
|
235
241
|
|
236
242
|
# See Index#search
|
@@ -241,9 +247,14 @@ module Ferret::Index
|
|
241
247
|
# puts "hit document number #{doc} with a score of #{score}"
|
242
248
|
# end
|
243
249
|
#
|
250
|
+
# returns:: The total number of hits.
|
244
251
|
def search_each(query, options = {}) # :yield: doc, score
|
245
|
-
|
246
|
-
|
252
|
+
@dir.synchronize do
|
253
|
+
hits = do_search(query, options)
|
254
|
+
hits.score_docs.each do |score_doc|
|
255
|
+
yield score_doc.doc, score_doc.score
|
256
|
+
end
|
257
|
+
return hits.total_hits
|
247
258
|
end
|
248
259
|
end
|
249
260
|
|
@@ -253,14 +264,16 @@ module Ferret::Index
|
|
253
264
|
# id:: The number of the document to retrieve, or the term used as the id
|
254
265
|
# for the document we wish to retrieve
|
255
266
|
def doc(id)
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
267
|
+
@dir.synchronize do
|
268
|
+
ensure_reader_open()
|
269
|
+
if id.is_a?(String)
|
270
|
+
t = Term.new("id", id.to_s)
|
271
|
+
return @reader.get_document_with_term(t)
|
272
|
+
elsif id.is_a?(Term)
|
273
|
+
return @reader.get_document_with_term(id)
|
274
|
+
else
|
275
|
+
return @reader.get_document(id)
|
276
|
+
end
|
264
277
|
end
|
265
278
|
end
|
266
279
|
alias :[] :doc
|
@@ -271,28 +284,34 @@ module Ferret::Index
|
|
271
284
|
#
|
272
285
|
# id:: The number of the document to delete, or a Term (or String "id" value) matching the documents to delete
|
273
286
|
def delete(id)
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
287
|
+
@dir.synchronize do
|
288
|
+
ensure_reader_open()
|
289
|
+
if id.is_a?(String)
|
290
|
+
t = Term.new("id", id.to_s)
|
291
|
+
return @reader.delete_docs_with_term(t)
|
292
|
+
elsif id.is_a?(Term)
|
293
|
+
return @reader.delete_docs_with_term(id)
|
294
|
+
else
|
295
|
+
return @reader.delete(id)
|
296
|
+
end
|
282
297
|
end
|
283
298
|
end
|
284
299
|
|
285
300
|
# Returns true if document +n+ has been deleted
|
286
301
|
def deleted?(n)
|
287
|
-
|
288
|
-
|
302
|
+
@dir.synchronize do
|
303
|
+
ensure_reader_open()
|
304
|
+
return @reader.deleted?(n)
|
305
|
+
end
|
289
306
|
end
|
290
307
|
|
291
308
|
# Returns true if any documents have been deleted since the index was last
|
292
309
|
# flushed.
|
293
310
|
def has_deletions?()
|
294
|
-
|
295
|
-
|
311
|
+
@dir.synchronize do
|
312
|
+
ensure_reader_open()
|
313
|
+
return @reader.has_deletions?
|
314
|
+
end
|
296
315
|
end
|
297
316
|
|
298
317
|
# Returns true if any documents have been added to the index since the
|
@@ -301,18 +320,102 @@ module Ferret::Index
|
|
301
320
|
return @has_writes
|
302
321
|
end
|
303
322
|
|
323
|
+
# Flushes all writes to the index. This will not optimize the index but it
|
324
|
+
# will make sure that all writes are written to it.
|
325
|
+
#
|
326
|
+
# NOTE: this is not necessary if you are only using this class. All writes
|
327
|
+
# will automatically flush when you perform an operation that reads the
|
328
|
+
# index.
|
329
|
+
def flush()
|
330
|
+
@dir.synchronize do
|
331
|
+
@reader.close if @reader
|
332
|
+
@writer.close if @writer
|
333
|
+
@reader = nil
|
334
|
+
@writer = nil
|
335
|
+
@searcher = nil
|
336
|
+
end
|
337
|
+
end
|
338
|
+
|
304
339
|
# optimizes the index. This should only be called when the index will no
|
305
340
|
# longer be updated very often, but will be read a lot.
|
306
341
|
def optimize()
|
307
|
-
|
308
|
-
|
309
|
-
|
342
|
+
@dir.synchronize do
|
343
|
+
ensure_writer_open()
|
344
|
+
@writer.optimize()
|
345
|
+
@modified = true
|
346
|
+
end
|
310
347
|
end
|
311
348
|
|
312
349
|
# returns the number of documents in the index
|
313
350
|
def size()
|
314
|
-
|
315
|
-
|
351
|
+
@dir.synchronize do
|
352
|
+
ensure_reader_open()
|
353
|
+
return @reader.num_docs()
|
354
|
+
end
|
355
|
+
end
|
356
|
+
|
357
|
+
# Merges all segments from an index or an array of indexes into this
|
358
|
+
# index. You can pass a single Index::Index, Index::IndexReader,
|
359
|
+
# Store::Directory or an array of any single one of these.
|
360
|
+
#
|
361
|
+
# This may be used to parallelize batch indexing. A large document
|
362
|
+
# collection can be broken into sub-collections. Each sub-collection can
|
363
|
+
# be indexed in parallel, on a different thread, process or machine and
|
364
|
+
# perhaps all in memory. The complete index can then be created by
|
365
|
+
# merging sub-collection indexes with this method.
|
366
|
+
#
|
367
|
+
# After this completes, the index is optimized.
|
368
|
+
def add_indexes(indexes)
|
369
|
+
@dir.synchronize do
|
370
|
+
indexes = [indexes].flatten # make sure we have an array
|
371
|
+
return if indexes.size == 0 # nothing to do
|
372
|
+
if indexes[0].is_a?(Index)
|
373
|
+
readers = indexes.map {|index| index.reader }
|
374
|
+
indexes = readers
|
375
|
+
end
|
376
|
+
|
377
|
+
if indexes[0].is_a?(IndexReader)
|
378
|
+
ensure_reader_open
|
379
|
+
indexes.delete(@reader) # we don't want to merge with self
|
380
|
+
ensure_writer_open
|
381
|
+
@writer.add_indexes_readers(indexes)
|
382
|
+
elsif indexes[0].is_a?(Ferret::Store::Directory)
|
383
|
+
indexes.delete(@dir) # we don't want to merge with self
|
384
|
+
ensure_writer_open
|
385
|
+
@writer.add_indexes(indexes)
|
386
|
+
else
|
387
|
+
raise ArgumentError, "Unknown index type when trying to merge indexes"
|
388
|
+
end
|
389
|
+
end
|
390
|
+
end
|
391
|
+
|
392
|
+
# This is a simple utility method for saving an in memory or RAM index to
|
393
|
+
# the file system. The same thing can be achieved by using the
|
394
|
+
# Index::Index#add_indexes method and you will have more options when
|
395
|
+
# creating the new index, however this is a simple way to turn a RAM index
|
396
|
+
# into a file system index.
|
397
|
+
#
|
398
|
+
# directory:: This can either be a Store::Directory object or a string
|
399
|
+
# representing the path to the directory where you would
|
400
|
+
# like to store the index.
|
401
|
+
#
|
402
|
+
# create:: True if you'd like to create the directory if it doesn't
|
403
|
+
# exist or copy over an existing directory. False if you'd
|
404
|
+
# like to merge with the existing directory. This defaults to
|
405
|
+
# true.
|
406
|
+
def persist(directory, create = true)
|
407
|
+
synchronize do
|
408
|
+
flush
|
409
|
+
old_dir = @dir
|
410
|
+
if directory.is_a?(String)
|
411
|
+
@dir = FSDirectory.new(directory, create)
|
412
|
+
@options[:close_dir] = true
|
413
|
+
elsif directory.is_a?(Ferret::Store::Directory)
|
414
|
+
@dir = directory
|
415
|
+
end
|
416
|
+
ensure_writer_open
|
417
|
+
@writer.add_indexes([old_dir])
|
418
|
+
end
|
316
419
|
end
|
317
420
|
|
318
421
|
protected
|
@@ -343,5 +446,20 @@ module Ferret::Index
|
|
343
446
|
ensure_reader_open()
|
344
447
|
@searcher = IndexSearcher.new(@reader)
|
345
448
|
end
|
449
|
+
|
450
|
+
private
|
451
|
+
def do_search(query, options)
|
452
|
+
ensure_searcher_open()
|
453
|
+
if query.is_a?(String)
|
454
|
+
if @qp.nil?
|
455
|
+
@qp = Ferret::QueryParser.new(@default_search_field, @options)
|
456
|
+
end
|
457
|
+
# we need to set this every time, in case a new field has been added
|
458
|
+
@qp.fields = @reader.get_field_names.to_a
|
459
|
+
query = @qp.parse(query)
|
460
|
+
end
|
461
|
+
|
462
|
+
return @searcher.search(query, options)
|
463
|
+
end
|
346
464
|
end
|
347
465
|
end
|