ferret 0.1.3 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (57) hide show
  1. data/Rakefile +1 -1
  2. data/TODO +3 -0
  3. data/ext/dummy.exe +0 -0
  4. data/lib/ferret.rb +1 -1
  5. data/lib/ferret/analysis/token.rb +6 -0
  6. data/lib/ferret/analysis/tokenizers.rb +5 -5
  7. data/lib/ferret/document/document.rb +10 -13
  8. data/lib/ferret/index/compound_file_io.rb +12 -9
  9. data/lib/ferret/index/field_infos.rb +0 -6
  10. data/lib/ferret/index/index.rb +220 -102
  11. data/lib/ferret/index/index_reader.rb +22 -2
  12. data/lib/ferret/index/index_writer.rb +55 -14
  13. data/lib/ferret/index/multi_reader.rb +279 -279
  14. data/lib/ferret/index/segment_infos.rb +3 -3
  15. data/lib/ferret/index/segment_merger.rb +7 -6
  16. data/lib/ferret/index/segment_reader.rb +23 -7
  17. data/lib/ferret/index/segment_term_enum.rb +6 -7
  18. data/lib/ferret/index/term_buffer.rb +3 -5
  19. data/lib/ferret/index/term_doc_enum.rb +7 -2
  20. data/lib/ferret/index/term_infos_io.rb +15 -8
  21. data/lib/ferret/query_parser/query_parser.tab.rb +49 -45
  22. data/lib/ferret/search/boolean_query.rb +3 -4
  23. data/lib/ferret/search/boolean_scorer.rb +11 -11
  24. data/lib/ferret/search/caching_wrapper_filter.rb +1 -1
  25. data/lib/ferret/search/disjunction_sum_scorer.rb +9 -7
  26. data/lib/ferret/search/field_cache.rb +1 -2
  27. data/lib/ferret/search/field_sorted_hit_queue.rb +1 -1
  28. data/lib/ferret/search/fuzzy_term_enum.rb +64 -58
  29. data/lib/ferret/search/index_searcher.rb +16 -9
  30. data/lib/ferret/search/prefix_query.rb +7 -0
  31. data/lib/ferret/search/query_filter.rb +1 -1
  32. data/lib/ferret/search/term_scorer.rb +5 -1
  33. data/lib/ferret/search/top_docs.rb +12 -0
  34. data/lib/ferret/store/buffered_index_io.rb +5 -6
  35. data/lib/ferret/store/fs_store.rb +47 -33
  36. data/lib/ferret/store/ram_store.rb +2 -2
  37. data/lib/ferret/utils.rb +1 -0
  38. data/lib/ferret/utils/bit_vector.rb +20 -2
  39. data/lib/ferret/utils/thread_local.rb +28 -0
  40. data/lib/ferret/utils/weak_key_hash.rb +11 -2
  41. data/test/benchmark/tb_rw_vint.rb +1 -1
  42. data/test/functional/thread_safety_index_test.rb +81 -0
  43. data/test/functional/thread_safety_test.rb +137 -0
  44. data/test/test_all.rb +3 -7
  45. data/test/test_helper.rb +2 -1
  46. data/test/unit/index/tc_compound_file_io.rb +2 -2
  47. data/test/unit/index/tc_index.rb +128 -6
  48. data/test/unit/index/tc_index_reader.rb +1 -1
  49. data/test/unit/index/tc_segment_infos.rb +1 -1
  50. data/test/unit/index/th_doc.rb +1 -1
  51. data/test/unit/search/tc_index_searcher.rb +6 -0
  52. data/test/unit/store/tc_fs_store.rb +3 -3
  53. data/test/unit/utils/tc_bit_vector.rb +8 -0
  54. data/test/unit/utils/tc_thread.rb +61 -0
  55. data/test/unit/utils/tc_weak_key_hash.rb +2 -2
  56. data/test/utils/number_to_spoken.rb +132 -0
  57. metadata +7 -2
data/Rakefile CHANGED
@@ -196,7 +196,7 @@ end
196
196
 
197
197
  desc "Make a new release"
198
198
  task :prerelease => [:clobber, :all_tests, :parsers]
199
- #task :package => [:prerelease]
199
+ task :package => [:prerelease]
200
200
  task :tag => [:prerelease]
201
201
  task :update_version => [:prerelease]
202
202
  task :release => [:tag, :update_version, :package] do
data/TODO CHANGED
@@ -5,8 +5,11 @@ Send suggestions for this list to mailto:dbalmain@gmail.com
5
5
  === To Do
6
6
 
7
7
  * Add the ability to persist an in memory index to Ferret::Index::Index
8
+ * Make a dll for people on Windows
8
9
 
9
10
  === Done
10
11
 
11
12
  * Add UTF-8 support
12
13
  * Multi Field Query
14
+ * Test threading
15
+ * Compile a proper dummy executable
data/ext/dummy.exe CHANGED
Binary file
data/lib/ferret.rb CHANGED
@@ -22,7 +22,7 @@
22
22
  #++
23
23
  # :include: ../TUTORIAL
24
24
  module Ferret
25
- VERSION = '0.1.3'
25
+ VERSION = '0.1.4'
26
26
  end
27
27
 
28
28
  require 'ferret/utils'
@@ -35,6 +35,12 @@ module Ferret::Analysis
35
35
  @position_increment = pos_inc
36
36
  end
37
37
 
38
+ def eql?(o)
39
+ return (o.instance_of?(Token) and @start_offset == o.start_offset and
40
+ @end_offset == o.end_offset and @term_text = o.term_text)
41
+ end
42
+ alias :== :eql?
43
+
38
44
  # Tokens are sorted by the position in the text at which they occur, ie
39
45
  # the start_offset. If two tokens have the same start offset, (see
40
46
  # position_increment=) then, they are sorted by the end_offset and then
@@ -26,7 +26,7 @@ module Ferret::Analysis
26
26
  #
27
27
  # class LetterTokenizer < RegExpTokenizer
28
28
  # def token_re()
29
- # /[a-zA-Z]+/
29
+ # /[[:alpha:]]+/
30
30
  # end
31
31
  # end
32
32
  class RegExpTokenizer < Tokenizer
@@ -63,7 +63,7 @@ module Ferret::Analysis
63
63
  protected
64
64
  # returns the regular expression used to find the next token
65
65
  def token_re
66
- /[a-zA-Z]+/
66
+ /[[:alpha:]]+/
67
67
  end
68
68
 
69
69
  # Called on each token to normalize it before it is added to the
@@ -75,13 +75,13 @@ module Ferret::Analysis
75
75
 
76
76
  # A LetterTokenizer is a tokenizer that divides text at non-letters.
77
77
  # That's to say, it defines tokens as maximal strings of adjacent letters,
78
- # as defined by the regular expression _/[a-zA-Z]+/_.
78
+ # as defined by the regular expression _/[[:alpha:]]+/_.
79
79
  class LetterTokenizer < RegExpTokenizer
80
80
  protected
81
81
  # Collects only characters which satisfy the regular expression
82
- # _/[a-zA-Z]+/_.
82
+ # _/[[:alpha:]]+/_.
83
83
  def token_re()
84
- /[a-zA-Z]+/
84
+ /[[:alpha:]]+/
85
85
  end
86
86
  end
87
87
 
@@ -69,13 +69,13 @@ module Ferret::Document
69
69
  # document has to be deleted from an index and a new changed version of
70
70
  # that document has to be added.
71
71
  def add_field(field)
72
- (@fields[field.name] ||= []) << field
72
+ (@fields[field.name.to_s] ||= []) << field
73
73
  end
74
74
  alias :<< :add_field
75
75
 
76
76
  # Removes the first field of this name if it exists.
77
77
  def remove_field(name)
78
- @fields[name].delete_at(0)
78
+ @fields[name.to_s].delete_at(0)
79
79
  end
80
80
 
81
81
  # Removes all fields with the given name from the document.
@@ -89,7 +89,7 @@ module Ferret::Document
89
89
  # this, a document has to be deleted from an index and a new changed
90
90
  # version of that document has to be added.
91
91
  def remove_fields(name)
92
- @fields.delete(name)
92
+ @fields.delete(name.to_s)
93
93
  end
94
94
 
95
95
  # Returns the first field with the given name.
@@ -98,7 +98,7 @@ module Ferret::Document
98
98
  # name:: the name of the field
99
99
  # Return:: the first _Field_ with that name, or nil if there is none
100
100
  def field(name)
101
- @fields[name] ? @fields[name][0] : nil
101
+ @fields[name.to_s] ? @fields[name.to_s][0] : nil
102
102
  end
103
103
 
104
104
  # Returns an array of all fields with the given name.
@@ -107,7 +107,7 @@ module Ferret::Document
107
107
  # name:: the name of the field
108
108
  # Return:: a _Field_ array
109
109
  def fields(name)
110
- @fields[name]
110
+ @fields[name.to_s]
111
111
  end
112
112
 
113
113
  # Returns an array of values of the field specified as the method
@@ -116,8 +116,8 @@ module Ferret::Document
116
116
  # name:: the name of the field
117
117
  # Return:: a _String_ of field values
118
118
  def values(name)
119
- return nil if @fields[name].nil?
120
- @fields[name].map {|f| f.data if not f.binary? }.join(" ")
119
+ return nil if @fields[name.to_s].nil?
120
+ @fields[name.to_s].map {|f| f.data if not f.binary? }.join(" ")
121
121
  end
122
122
  alias :[] :values
123
123
 
@@ -125,7 +125,7 @@ module Ferret::Document
125
125
  # field of that name then it will set the data in the first field of that
126
126
  # name.
127
127
  def []=(field_name, data)
128
- field = field(field_name)
128
+ field = field(field_name.to_s)
129
129
  raise ArgumentError, "Field does not exist" unless field
130
130
  field.data = data
131
131
  end
@@ -137,16 +137,13 @@ module Ferret::Document
137
137
  # Return:: an _Array_ holding the data of each binary field
138
138
  def binaries(name)
139
139
  binaries = []
140
- @fields[name].each {|f| binaries << f.data if f.binary? }
140
+ @fields[name.to_s].each {|f| binaries << f.data if f.binary? }
141
141
  return binaries
142
142
  end
143
143
 
144
144
  # Prints the fields of a document for human consumption.#/
145
145
  def to_s()
146
- field_str = ""
147
- @fields.each_key { |name| field_str += name + " " }
148
- field_str[-1] = ">"
149
- return "Document<" + field_str
146
+ return "Document<#{@fields.keys.join(" ")}>"
150
147
  end
151
148
  end
152
149
  end
@@ -92,7 +92,7 @@ module Ferret::Index
92
92
  end
93
93
 
94
94
  # Returns true iff a file with the given name exists.
95
- def file_exists(name)
95
+ def exists?(name)
96
96
  return @entries.key?(name)
97
97
  end
98
98
 
@@ -113,7 +113,7 @@ module Ferret::Index
113
113
  def rename(from, to) raise(UnsupportedOperationError) end
114
114
 
115
115
  # Returns the length of a file in the directory.
116
- def file_length(name)
116
+ def length(name)
117
117
  e = @entries[name]
118
118
  if (e == nil): raise(IOError, "File " + name + " does not exist") end
119
119
  return e.length
@@ -188,6 +188,9 @@ module Ferret::Index
188
188
  # data section, and a UTF String with that file's extension.
189
189
  class CompoundFileWriter
190
190
 
191
+ class StateError < Exception
192
+ end
193
+
191
194
  attr_reader :directory, :file_name
192
195
 
193
196
  # Create the compound stream in the specified file. The file name is the
@@ -203,16 +206,16 @@ module Ferret::Index
203
206
  # Add a source stream. _file_name_ is the string by which the
204
207
  # sub-stream will be known in the compound stream.
205
208
  #
206
- # Throws:: IllegalStateError if this writer is closed
207
- # Throws:: IllegalArgumentError if a file with the same name
209
+ # Throws:: StateError if this writer is closed
210
+ # Throws:: ArgumentError if a file with the same name
208
211
  # has been added already
209
212
  def add_file(file_name)
210
213
  if @merged
211
- raise(IllegalStateError, "Can't add extensions after merge has been called")
214
+ raise(StateError, "Can't add extensions after merge has been called")
212
215
  end
213
216
 
214
217
  if not @ids.add?(file_name)
215
- raise(IllegalArgumentError, "File " + file + " already added")
218
+ raise(ArgumentError, "File #{file_name} already added")
216
219
  end
217
220
 
218
221
  entry = FileEntry.new(file_name)
@@ -224,16 +227,16 @@ module Ferret::Index
224
227
  # compound stream. After successful merge, the source files
225
228
  # are deleted.
226
229
  #
227
- # Throws:: IllegalStateException if close() had been called before or
230
+ # Throws:: StateException if close() had been called before or
228
231
  # if no file has been added to this object
229
232
  def close()
230
233
 
231
234
  if @merged
232
- raise(IllegalStateException, "Merge already performed")
235
+ raise(StateException, "Merge already performed")
233
236
  end
234
237
 
235
238
  if @file_entries.empty?
236
- raise(IllegalStateException, "No entries to merge have been defined")
239
+ raise(StateException, "No entries to merge have been defined")
237
240
  end
238
241
 
239
242
  @merged = true
@@ -27,12 +27,6 @@ module Ferret
27
27
  end
28
28
  end
29
29
 
30
- # Returns the number of fields that have been added to this field infos
31
- # object.
32
- def size
33
- return @fi_array.size
34
- end
35
-
36
30
  # Automatically adds all of the fields from the document if they haven't
37
31
  # been added already. Or it will update the values.
38
32
  def add_doc_fields(doc)
@@ -1,7 +1,11 @@
1
+ require 'monitor'
2
+
1
3
  module Ferret::Index
2
4
  # This is a simplified interface to the index. See the TUTORIAL for more
3
5
  # information on how to use this class.
4
6
  class Index
7
+ include MonitorMixin
8
+
5
9
  include Ferret::Store
6
10
  include Ferret::Search
7
11
  include Ferret::Document
@@ -77,9 +81,10 @@ module Ferret::Index
77
81
  # :default_slop => 2)
78
82
  #
79
83
  def initialize(options = {})
84
+ super()
85
+ options[:create_if_missing] = true if options[:create_if_missing].nil?
80
86
  if options[:path]
81
- options[:create_if_missing] = true if options[:create_if_missing].nil?
82
- @dir = FSDirectory.new(options[:path], true)
87
+ @dir = FSDirectory.new(options[:path], options[:create])
83
88
  options[:close_dir] = true
84
89
  elsif options[:dir]
85
90
  @dir = options[:dir]
@@ -88,29 +93,34 @@ module Ferret::Index
88
93
  @dir = RAMDirectory.new
89
94
  end
90
95
 
91
- @options = options
92
- @writer = IndexWriter.new(@dir, options)
93
- options[:analyzer] = @analyzer = @writer.analyzer
94
- @has_writes = false
95
- @reader = nil
96
- @options.delete(:create) # only want to create the first time if at all
97
- @close_dir = @options.delete(:close_dir) || false # we'll hold this here
98
- @default_search_field = (@options[:default_search_field] || \
99
- @options[:default_field] || "*")
100
- @default_field = @options[:default_field] || ""
101
- @open = true
96
+ @dir.synchronize do
97
+ @options = options
98
+ @writer = IndexWriter.new(@dir, options)
99
+ options[:analyzer] = @analyzer = @writer.analyzer
100
+ @has_writes = false
101
+ @reader = nil
102
+ @options.delete(:create) # only want to create the first time if at all
103
+ @close_dir = @options.delete(:close_dir) || false # we'll hold this here
104
+ @default_search_field = (@options[:default_search_field] || \
105
+ @options[:default_field] || "*")
106
+ @default_field = @options[:default_field] || ""
107
+ @open = true
108
+ @qp = nil
109
+ end
102
110
  end
103
111
 
104
112
  # Closes this index by closing its associated reader and writer objects.
105
113
  def close
106
- if not @open
107
- raise "tried to close an already closed directory"
108
- end
109
- @reader.close() if @reader
110
- @writer.close() if @writer
111
- @dir.close()
114
+ @dir.synchronize do
115
+ if not @open
116
+ raise "tried to close an already closed directory"
117
+ end
118
+ @reader.close() if @reader
119
+ @writer.close() if @writer
120
+ @dir.close()
112
121
 
113
- @open = false
122
+ @open = false
123
+ end
114
124
  end
115
125
 
116
126
  # Get the reader for this index.
@@ -133,6 +143,7 @@ module Ferret::Index
133
143
  ensure_writer_open()
134
144
  return @writer
135
145
  end
146
+ protected :reader, :writer, :searcher
136
147
 
137
148
  # Adds a document to this index, using the provided analyzer instead of
138
149
  # the local analyzer if provided. If the document contains more than
@@ -147,27 +158,28 @@ module Ferret::Index
147
158
  # index << "This is a new document to be indexed"
148
159
  # index << ["And here", "is another", "new document", "to be indexed"]
149
160
  #
150
- # But these are pretty simple documents. If this is all you want to index you
151
- # could probably just use SimpleSearch. So let's give our documents some fields;
161
+ # But these are pretty simple documents. If this is all you want to index
162
+ # you could probably just use SimpleSearch. So let's give our documents
163
+ # some fields;
152
164
  #
153
165
  # index << {:title => "Programming Ruby", :content => "blah blah blah"}
154
166
  # index << {:title => "Programming Ruby", :content => "yada yada yada"}
155
167
  #
156
- # Or if you are indexing data stored in a database, you'll probably want to
157
- # store the id;
168
+ # Or if you are indexing data stored in a database, you'll probably want
169
+ # to store the id;
158
170
  #
159
171
  # index << {:id => row.id, :title => row.title, :date => row.date}
160
172
  #
161
- # The methods above while store all of the input data as well tokenizing and
162
- # indexing it. Sometimes we won't want to tokenize (divide the string into
163
- # tokens) the data. For example, we might want to leave the title as a complete
164
- # string and only allow searchs for that complete string. Sometimes we won't
165
- # want to store the data as it's already stored in the database so it'll be a
166
- # waste to store it in the index. Or perhaps we are doing without a database and
167
- # using Ferret to store all of our data, in which case we might not want to
168
- # index it. For example, if we are storing images in the index, we won't want to
169
- # index them. All of this can be done using Ferret's Ferret::Document module.
170
- # eg;
173
+ # The methods above will store all of the input data as well as tokenizing
174
+ # and indexing it. Sometimes we won't want to tokenize (divide the string
175
+ # into tokens) the data. For example, we might want to leave the title as
176
+ # a complete string and only allow searches for that complete string.
177
+ # Sometimes we won't want to store the data as it's already stored in the
178
+ # database so it'll be a waste to store it in the index. Or perhaps we are
179
+ # doing without a database and using Ferret to store all of our data, in
180
+ # which case we might not want to index it. For example, if we are storing
181
+ # images in the index, we won't want to index them. All of this can be
182
+ # done using Ferret's Ferret::Document module. eg;
171
183
  #
172
184
  # include Ferret::Document
173
185
  # doc = Document.new
@@ -177,35 +189,37 @@ module Ferret::Index
177
189
  # doc << Field.new("image", row.image, Field::Store::YES, Field::Index::NO)
178
190
  # index << doc
179
191
  #
180
- # You can also compress the data that you are storing or store term vectors with
181
- # the data. Read more about this in Ferret::Document::Field.
192
+ # You can also compress the data that you are storing or store term
193
+ # vectors with the data. Read more about this in Ferret::Document::Field.
182
194
  def add_document(doc, analyzer = nil)
183
- ensure_writer_open()
184
- fdoc = nil
185
- if doc.is_a?(String)
186
- fdoc = Document.new
187
- fdoc << Field.new(@default_field, doc,
188
- Field::Store::YES, Field::Index::TOKENIZED)
189
- elsif doc.is_a?(Array)
190
- fdoc = Document.new
191
- doc.each() do |field|
192
- fdoc << Field.new(@default_field, field,
195
+ @dir.synchronize do
196
+ ensure_writer_open()
197
+ fdoc = nil
198
+ if doc.is_a?(String)
199
+ fdoc = Document.new
200
+ fdoc << Field.new(@default_field, doc,
193
201
  Field::Store::YES, Field::Index::TOKENIZED)
202
+ elsif doc.is_a?(Array)
203
+ fdoc = Document.new
204
+ doc.each() do |field|
205
+ fdoc << Field.new(@default_field, field,
206
+ Field::Store::YES, Field::Index::TOKENIZED)
207
+ end
208
+ elsif doc.is_a?(Hash)
209
+ fdoc = Document.new
210
+ doc.each_pair() do |field, text|
211
+ fdoc << Field.new(field.to_s, text.to_s,
212
+ Field::Store::YES, Field::Index::TOKENIZED)
213
+ end
214
+ elsif doc.is_a?(Document)
215
+ fdoc = doc
216
+ else
217
+ raise ArgumentError, "Unknown document type #{doc.class}"
194
218
  end
195
- elsif doc.is_a?(Hash)
196
- fdoc = Document.new
197
- doc.each_pair() do |field, text|
198
- fdoc << Field.new(field.to_s, text.to_s,
199
- Field::Store::YES, Field::Index::TOKENIZED)
200
- end
201
- elsif doc.is_a?(Document)
202
- fdoc = doc
203
- else
204
- raise ArgumentError, "Unknown document type #{doc.class}"
205
- end
206
- @has_writes = true
219
+ @has_writes = true
207
220
 
208
- @writer.add_document(fdoc, analyzer || @writer.analyzer)
221
+ @writer.add_document(fdoc, analyzer || @writer.analyzer)
222
+ end
209
223
  end
210
224
  alias :<< :add_document
211
225
 
@@ -213,24 +227,16 @@ module Ferret::Index
213
227
  # pass to this method. You can also pass a hash with one or more of the
214
228
  # following; {filter, num_docs, first_doc, sort}
215
229
  #
216
- # query:: the query to run on the index
217
- # filter:: filters docs from the search result
218
- # first_doc:: The index in the results of the first doc retrieved.
219
- # Default is 0
220
- # num_docs:: The number of results returned. Default is 10
221
- # sort:: an array of SortFields describing how to sort the results.
230
+ # query:: The query to run on the index
231
+ # filter:: Filters docs from the search result
232
+ # first_doc:: The index in the results of the first doc retrieved.
233
+ # Default is 0
234
+ # num_docs:: The number of results returned. Default is 10
235
+ # sort:: An array of SortFields describing how to sort the results.
222
236
  def search(query, options = {})
223
- ensure_searcher_open()
224
- if query.is_a?(String)
225
- if @qp.nil?
226
- @qp = Ferret::QueryParser.new(@default_search_field, @options)
227
- end
228
- # we need to set this ever time, in case a new field has been added
229
- @qp.fields = @reader.get_field_names.to_a
230
- query = @qp.parse(query)
237
+ @dir.synchronize do
238
+ return do_search(query, options)
231
239
  end
232
-
233
- return @searcher.search(query, options)
234
240
  end
235
241
 
236
242
  # See Index#search
@@ -241,9 +247,14 @@ module Ferret::Index
241
247
  # puts "hit document number #{doc} with a score of #{score}"
242
248
  # end
243
249
  #
250
+ # returns:: The total number of hits.
244
251
  def search_each(query, options = {}) # :yield: doc, score
245
- search(query, options).score_docs.each do |score_doc|
246
- yield score_doc.doc, score_doc.score
252
+ @dir.synchronize do
253
+ hits = do_search(query, options)
254
+ hits.score_docs.each do |score_doc|
255
+ yield score_doc.doc, score_doc.score
256
+ end
257
+ return hits.total_hits
247
258
  end
248
259
  end
249
260
 
@@ -253,14 +264,16 @@ module Ferret::Index
253
264
  # id:: The number of the document to retrieve, or the term used as the id
254
265
  # for the document we wish to retrieve
255
266
  def doc(id)
256
- ensure_reader_open()
257
- if id.is_a?(String)
258
- t = Term.new("id", id.to_s)
259
- return @reader.get_document_with_term(t)
260
- elsif id.is_a?(Term)
261
- return @reader.get_document_with_term(id)
262
- else
263
- return @reader.get_document(id)
267
+ @dir.synchronize do
268
+ ensure_reader_open()
269
+ if id.is_a?(String)
270
+ t = Term.new("id", id.to_s)
271
+ return @reader.get_document_with_term(t)
272
+ elsif id.is_a?(Term)
273
+ return @reader.get_document_with_term(id)
274
+ else
275
+ return @reader.get_document(id)
276
+ end
264
277
  end
265
278
  end
266
279
  alias :[] :doc
@@ -271,28 +284,34 @@ module Ferret::Index
271
284
  #
272
285
  # id:: The number of the document to delete
273
286
  def delete(id)
274
- ensure_reader_open()
275
- if id.is_a?(String)
276
- t = Term.new("id", id.to_s)
277
- return @reader.delete_docs_with_term(t)
278
- elsif id.is_a?(Term)
279
- return @reader.delete_docs_with_term(id)
280
- else
281
- return @reader.delete(id)
287
+ @dir.synchronize do
288
+ ensure_reader_open()
289
+ if id.is_a?(String)
290
+ t = Term.new("id", id.to_s)
291
+ return @reader.delete_docs_with_term(t)
292
+ elsif id.is_a?(Term)
293
+ return @reader.delete_docs_with_term(id)
294
+ else
295
+ return @reader.delete(id)
296
+ end
282
297
  end
283
298
  end
284
299
 
285
300
  # Returns true if document +n+ has been deleted
286
301
  def deleted?(n)
287
- ensure_reader_open()
288
- return @reader.deleted?(n)
302
+ @dir.synchronize do
303
+ ensure_reader_open()
304
+ return @reader.deleted?(n)
305
+ end
289
306
  end
290
307
 
291
308
  # Returns true if any documents have been deleted since the index was last
292
309
  # flushed.
293
310
  def has_deletions?()
294
- ensure_reader_open()
295
- return @reader.has_deletions?
311
+ @dir.synchronize do
312
+ ensure_reader_open()
313
+ return @reader.has_deletions?
314
+ end
296
315
  end
297
316
 
298
317
  # Returns true if any documents have been added to the index since the
@@ -301,18 +320,102 @@ module Ferret::Index
301
320
  return @has_writes
302
321
  end
303
322
 
323
+ # Flushes all writes to the index. This will not optimize the index but it
324
+ # will make sure that all writes are written to it.
325
+ #
326
+ # NOTE: this is not necessary if you are only using this class. All writes
327
+ # will automatically flush when you perform an operation that reads the
328
+ # index.
329
+ def flush()
330
+ @dir.synchronize do
331
+ @reader.close if @reader
332
+ @writer.close if @writer
333
+ @reader = nil
334
+ @writer = nil
335
+ @searcher = nil
336
+ end
337
+ end
338
+
304
339
  # optimizes the index. This should only be called when the index will no
305
340
  # longer be updated very often, but will be read a lot.
306
341
  def optimize()
307
- ensure_writer_open()
308
- @writer.optimize()
309
- @modified = true
342
+ @dir.synchronize do
343
+ ensure_writer_open()
344
+ @writer.optimize()
345
+ @modified = true
346
+ end
310
347
  end
311
348
 
312
349
  # returns the number of documents in the index
313
350
  def size()
314
- ensure_reader_open()
315
- return @reader.num_docs()
351
+ @dir.synchronize do
352
+ ensure_reader_open()
353
+ return @reader.num_docs()
354
+ end
355
+ end
356
+
357
+ # Merges all segments from an index or an array of indexes into this
358
+ # index. You can pass a single Index::Index, Index::Reader,
359
+ # Store::Directory or an array of any single one of these.
360
+ #
361
+ # This may be used to parallelize batch indexing. A large document
362
+ # collection can be broken into sub-collections. Each sub-collection can
363
+ # be indexed in parallel, on a different thread, process or machine and
364
+ # perhaps all in memory. The complete index can then be created by
365
+ # merging sub-collection indexes with this method.
366
+ #
367
+ # After this completes, the index is optimized.
368
+ def add_indexes(indexes)
369
+ @dir.synchronize do
370
+ indexes = [indexes].flatten # make sure we have an array
371
+ return if indexes.size == 0 # nothing to do
372
+ if indexes[0].is_a?(Index)
373
+ readers = indexes.map {|index| index.reader }
374
+ indexes = readers
375
+ end
376
+
377
+ if indexes[0].is_a?(IndexReader)
378
+ ensure_reader_open
379
+ indexes.delete(@reader) # we don't want to merge with self
380
+ ensure_writer_open
381
+ @writer.add_indexes_readers(indexes)
382
+ elsif indexes[0].is_a?(Ferret::Store::Directory)
383
+ indexes.delete(@dir) # we don't want to merge with self
384
+ ensure_writer_open
385
+ @writer.add_indexes(indexes)
386
+ else
387
+ raise ArgumentError, "Unknown index type when trying to merge indexes"
388
+ end
389
+ end
390
+ end
391
+
392
+ # This is a simple utility method for saving an in memory or RAM index to
393
+ # the file system. The same thing can be achieved by using the
394
+ # Index::Index#add_indexes method and you will have more options when
395
+ # creating the new index, however this is a simple way to turn a RAM index
396
+ # into a file system index.
397
+ #
398
+ # directory:: This can either be a Store::Directory object or a string
399
+ # representing the path to the directory where you would
400
+ # like to store the index.
401
+ #
402
+ # create:: True if you'd like to create the directory if it doesn't
403
+ # exist or copy over an existing directory. False if you'd
404
+ # like to merge with the existing directory. This defaults to
405
+ # true.
406
+ def persist(directory, create = true)
407
+ synchronize do
408
+ flush
409
+ old_dir = @dir
410
+ if directory.is_a?(String)
411
+ @dir = FSDirectory.new(directory, create)
412
+ @options[:close_dir] = true
413
+ elsif directory.is_a?(Ferret::Store::Directory)
414
+ @dir = directory
415
+ end
416
+ ensure_writer_open
417
+ @writer.add_indexes([old_dir])
418
+ end
316
419
  end
317
420
 
318
421
  protected
@@ -343,5 +446,20 @@ module Ferret::Index
343
446
  ensure_reader_open()
344
447
  @searcher = IndexSearcher.new(@reader)
345
448
  end
449
+
450
+ private
451
+ def do_search(query, options)
452
+ ensure_searcher_open()
453
+ if query.is_a?(String)
454
+ if @qp.nil?
455
+ @qp = Ferret::QueryParser.new(@default_search_field, @options)
456
+ end
457
+ # we need to set this every time, in case a new field has been added
458
+ @qp.fields = @reader.get_field_names.to_a
459
+ query = @qp.parse(query)
460
+ end
461
+
462
+ return @searcher.search(query, options)
463
+ end
346
464
  end
347
465
  end