ferret 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. data/Rakefile +1 -1
  2. data/TODO +3 -0
  3. data/ext/dummy.exe +0 -0
  4. data/lib/ferret.rb +1 -1
  5. data/lib/ferret/analysis/token.rb +6 -0
  6. data/lib/ferret/analysis/tokenizers.rb +5 -5
  7. data/lib/ferret/document/document.rb +10 -13
  8. data/lib/ferret/index/compound_file_io.rb +12 -9
  9. data/lib/ferret/index/field_infos.rb +0 -6
  10. data/lib/ferret/index/index.rb +220 -102
  11. data/lib/ferret/index/index_reader.rb +22 -2
  12. data/lib/ferret/index/index_writer.rb +55 -14
  13. data/lib/ferret/index/multi_reader.rb +279 -279
  14. data/lib/ferret/index/segment_infos.rb +3 -3
  15. data/lib/ferret/index/segment_merger.rb +7 -6
  16. data/lib/ferret/index/segment_reader.rb +23 -7
  17. data/lib/ferret/index/segment_term_enum.rb +6 -7
  18. data/lib/ferret/index/term_buffer.rb +3 -5
  19. data/lib/ferret/index/term_doc_enum.rb +7 -2
  20. data/lib/ferret/index/term_infos_io.rb +15 -8
  21. data/lib/ferret/query_parser/query_parser.tab.rb +49 -45
  22. data/lib/ferret/search/boolean_query.rb +3 -4
  23. data/lib/ferret/search/boolean_scorer.rb +11 -11
  24. data/lib/ferret/search/caching_wrapper_filter.rb +1 -1
  25. data/lib/ferret/search/disjunction_sum_scorer.rb +9 -7
  26. data/lib/ferret/search/field_cache.rb +1 -2
  27. data/lib/ferret/search/field_sorted_hit_queue.rb +1 -1
  28. data/lib/ferret/search/fuzzy_term_enum.rb +64 -58
  29. data/lib/ferret/search/index_searcher.rb +16 -9
  30. data/lib/ferret/search/prefix_query.rb +7 -0
  31. data/lib/ferret/search/query_filter.rb +1 -1
  32. data/lib/ferret/search/term_scorer.rb +5 -1
  33. data/lib/ferret/search/top_docs.rb +12 -0
  34. data/lib/ferret/store/buffered_index_io.rb +5 -6
  35. data/lib/ferret/store/fs_store.rb +47 -33
  36. data/lib/ferret/store/ram_store.rb +2 -2
  37. data/lib/ferret/utils.rb +1 -0
  38. data/lib/ferret/utils/bit_vector.rb +20 -2
  39. data/lib/ferret/utils/thread_local.rb +28 -0
  40. data/lib/ferret/utils/weak_key_hash.rb +11 -2
  41. data/test/benchmark/tb_rw_vint.rb +1 -1
  42. data/test/functional/thread_safety_index_test.rb +81 -0
  43. data/test/functional/thread_safety_test.rb +137 -0
  44. data/test/test_all.rb +3 -7
  45. data/test/test_helper.rb +2 -1
  46. data/test/unit/index/tc_compound_file_io.rb +2 -2
  47. data/test/unit/index/tc_index.rb +128 -6
  48. data/test/unit/index/tc_index_reader.rb +1 -1
  49. data/test/unit/index/tc_segment_infos.rb +1 -1
  50. data/test/unit/index/th_doc.rb +1 -1
  51. data/test/unit/search/tc_index_searcher.rb +6 -0
  52. data/test/unit/store/tc_fs_store.rb +3 -3
  53. data/test/unit/utils/tc_bit_vector.rb +8 -0
  54. data/test/unit/utils/tc_thread.rb +61 -0
  55. data/test/unit/utils/tc_weak_key_hash.rb +2 -2
  56. data/test/utils/number_to_spoken.rb +132 -0
  57. metadata +7 -2
data/Rakefile CHANGED
@@ -196,7 +196,7 @@ end
 
 desc "Make a new release"
 task :prerelease => [:clobber, :all_tests, :parsers]
-#task :package => [:prerelease]
+task :package => [:prerelease]
 task :tag => [:prerelease]
 task :update_version => [:prerelease]
 task :release => [:tag, :update_version, :package] do
data/TODO CHANGED
@@ -5,8 +5,11 @@ Send suggestions for this list to mailto:dbalmain@gmail.com
 === To Do
 
 * Add the ability to persist an in memory index to Ferret::Index::Index
+* Make a dll for people on Windows
 
 === Done
 
 * Add UTF-8 support
 * Multi Field Query
+* Test threading
+* Compile a proper dummy executable
data/ext/dummy.exe CHANGED
Binary file
data/lib/ferret.rb CHANGED
@@ -22,7 +22,7 @@
 #++
 # :include: ../TUTORIAL
 module Ferret
-  VERSION = '0.1.3'
+  VERSION = '0.1.4'
 end
 
 require 'ferret/utils'
data/lib/ferret/analysis/token.rb CHANGED
@@ -35,6 +35,12 @@ module Ferret::Analysis
       @position_increment = pos_inc
     end
 
+    def eql?(o)
+      return (o.instance_of?(Token) and @start_offset == o.start_offset and
+              @end_offset == o.end_offset and @term_text = o.term_text)
+    end
+    alias :== :eql?
+
     # Tokens are sorted by the position in the text at which they occur, ie
     # the start_offset. If two tokens have the same start offset, (see
     # position_increment=) then, they are sorted by the end_offset and then
data/lib/ferret/analysis/tokenizers.rb CHANGED
@@ -26,7 +26,7 @@ module Ferret::Analysis
   #
   #   class LetterTokenizer < RegExpTokenizer
   #     def token_re()
-  #       /[a-zA-Z]+/
+  #       /[[:alpha:]]+/
   #     end
   #   end
   class RegExpTokenizer < Tokenizer
@@ -63,7 +63,7 @@ module Ferret::Analysis
     protected
       # returns the regular expression used to find the next token
       def token_re
-        /[a-zA-Z]+/
+        /[[:alpha:]]+/
      end
 
       # Called on each token to normalize it before it is added to the
@@ -75,13 +75,13 @@ module Ferret::Analysis
 
   # A LetterTokenizer is a tokenizer that divides text at non-letters.
   # That's to say, it defines tokens as maximal strings of adjacent letters,
-  # as defined by the regular expression _/[a-zA-Z]+/_.
+  # as defined by the regular expression _/[[:alpha:]]+/_.
   class LetterTokenizer < RegExpTokenizer
     protected
       # Collects only characters which satisfy the regular expression
-      # _/[a-zA-Z]+/_.
+      # _/[[:alpha:]]+/_.
       def token_re()
-        /[a-zA-Z]+/
+        /[[:alpha:]]+/
       end
   end
 
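The changes above swap the ASCII-only /[a-zA-Z]+/ for the POSIX class /[[:alpha:]]+/, which is what lets the letter-based tokenizers keep accented and other non-ASCII letters together (see "Add UTF-8 support" under Done in the TODO). A rough illustration of the difference on a modern Ruby; this snippet is not from the gem itself:

    text = "café au lait"
    text.scan(/[a-zA-Z]+/)     # => ["caf", "au", "lait"]   ("é" breaks the token)
    text.scan(/[[:alpha:]]+/)  # => ["café", "au", "lait"]  (accented letter kept)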
data/lib/ferret/document/document.rb CHANGED
@@ -69,13 +69,13 @@ module Ferret::Document
     # document has to be deleted from an index and a new changed version of
     # that document has to be added.
     def add_field(field)
-      (@fields[field.name] ||= []) << field
+      (@fields[field.name.to_s] ||= []) << field
     end
     alias :<< :add_field
 
     # Removes the first field of this name if it exists.
     def remove_field(name)
-      @fields[name].delete_at(0)
+      @fields[name.to_s].delete_at(0)
     end
 
     # Removes all fields with the given name from the document.
@@ -89,7 +89,7 @@ module Ferret::Document
     # this, a document has to be deleted from an index and a new changed
     # version of that document has to be added.
     def remove_fields(name)
-      @fields.delete(name)
+      @fields.delete(name.to_s)
     end
 
     # Returns the first field with the given name.
@@ -98,7 +98,7 @@ module Ferret::Document
     # name:: the name of the field
     # Return:: a _Field_ array
     def field(name)
-      @fields[name] ? @fields[name][0] : nil
+      @fields[name.to_s] ? @fields[name.to_s][0] : nil
     end
 
     # Returns an array of all fields with the given name.
@@ -107,7 +107,7 @@ module Ferret::Document
     # name:: the name of the field
     # Return:: a _Field_ array
     def fields(name)
-      @fields[name]
+      @fields[name.to_s]
     end
 
     # Returns an array of values of the field specified as the method
@@ -116,8 +116,8 @@ module Ferret::Document
     # name:: the name of the field
     # Return:: a _String_ of field values
     def values(name)
-      return nil if @fields[name].nil?
-      @fields[name].map {|f| f.data if not f.binary? }.join(" ")
+      return nil if @fields[name.to_s].nil?
+      @fields[name.to_s].map {|f| f.data if not f.binary? }.join(" ")
     end
     alias :[] :values
 
@@ -125,7 +125,7 @@ module Ferret::Document
     # field of that name then it will set the data in the first field of that
     # name.
     def []=(field_name, data)
-      field = field(field_name)
+      field = field(field_name.to_s)
       raise ArgumentError, "Field does not exist" unless field
       field.data = data
     end
@@ -137,16 +137,13 @@ module Ferret::Document
     # Return:: a _String_ of field values
     def binaries(name)
       binaries = []
-      @fields[name].each {|f| binaries << f.data if f.binary? }
+      @fields[name.to_s].each {|f| binaries << f.data if f.binary? }
       return binaries
     end
 
     # Prints the fields of a document for human consumption.#/
     def to_s()
-      field_str = ""
-      @fields.each_key { |name| field_str += name + " " }
-      field_str[-1] = ">"
-      return "Document<" + field_str
+      return "Document<#{@fields.keys.join(" ")}>"
     end
   end
 end
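Every Document accessor above now calls to_s on the field name, so Symbol and String keys address the same field. A small illustrative sketch of the effect (not taken from the gem's own test suite):

    include Ferret::Document

    doc = Document.new
    doc << Field.new("title", "Programming Ruby",
                     Field::Store::YES, Field::Index::TOKENIZED)

    # Both lookups reach the "title" field now that names are normalised
    # with to_s; previously a Symbol key would have missed the String key.
    doc[:title]   # => "Programming Ruby"
    doc["title"]  # => "Programming Ruby"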
data/lib/ferret/index/compound_file_io.rb CHANGED
@@ -92,7 +92,7 @@ module Ferret::Index
     end
 
     # Returns true iff a file with the given name exists.
-    def file_exists(name)
+    def exists?(name)
       return @entries.key?(name)
     end
 
@@ -113,7 +113,7 @@ module Ferret::Index
     def rename(from, to) raise(UnsupportedOperationError) end
 
     # Returns the length of a file in the directory.
-    def file_length(name)
+    def length(name)
       e = @entries[name]
       if (e == nil): raise(IOError, "File " + name + " does not exist") end
       return e.length
@@ -188,6 +188,9 @@ module Ferret::Index
   # data section, and a UTF String with that file's extension.
   class CompoundFileWriter
 
+    class StateError < Exception
+    end
+
     attr_reader :directory, :file_name
 
     # Create the compound stream in the specified file. The file name is the
@@ -203,16 +206,16 @@ module Ferret::Index
     # Add a source stream. _file_name_ is the string by which the
     # sub-stream will be known in the compound stream.
     #
-    # Throws:: IllegalStateError if this writer is closed
-    # Throws:: IllegalArgumentError if a file with the same name
+    # Throws:: StateError if this writer is closed
+    # Throws:: ArgumentError if a file with the same name
     #          has been added already
     def add_file(file_name)
       if @merged
-        raise(IllegalStateError, "Can't add extensions after merge has been called")
+        raise(StateError, "Can't add extensions after merge has been called")
       end
 
       if not @ids.add?(file_name)
-        raise(IllegalArgumentError, "File " + file + " already added")
+        raise(ArgumentError, "File #{file_name} already added")
       end
 
       entry = FileEntry.new(file_name)
@@ -224,16 +227,16 @@ module Ferret::Index
     # compound stream. After successful merge, the source files
     # are deleted.
     #
-    # Throws:: IllegalStateException if close() had been called before or
+    # Throws:: StateException if close() had been called before or
     #          if no file has been added to this object
     def close()
 
       if @merged
-        raise(IllegalStateException, "Merge already performed")
+        raise(StateException, "Merge already performed")
       end
 
       if @file_entries.empty?
-        raise(IllegalStateException, "No entries to merge have been defined")
+        raise(StateException, "No entries to merge have been defined")
       end
 
       @merged = true
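The Java-style exception names give way to a nested CompoundFileWriter::StateError and the standard ArgumentError. A hedged sketch of how a caller might see the duplicate-file case; the constructor arguments shown here are assumptions, not taken from this diff:

    # Assumed constructor: an open Directory plus the compound file's name.
    writer = Ferret::Index::CompoundFileWriter.new(dir, "_1.cfs")
    writer.add_file("_1.fnm")
    begin
      writer.add_file("_1.fnm")   # adding the same sub-file name twice
    rescue ArgumentError => e
      e.message                   # => "File _1.fnm already added"
    end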
data/lib/ferret/index/field_infos.rb CHANGED
@@ -27,12 +27,6 @@ module Ferret
       end
     end
 
-    # Returns the number of fields that have been added to this field infos
-    # object.
-    def size
-      return @fi_array.size
-    end
-
     # Automatically adds all of the fields from the document if they haven't
     # been added already. Or it will update the values.
     def add_doc_fields(doc)
data/lib/ferret/index/index.rb CHANGED
@@ -1,7 +1,11 @@
+require 'monitor'
+
 module Ferret::Index
   # This is a simplified interface to the index. See the TUTORIAL for more
   # information on how to use this class.
   class Index
+    include MonitorMixin
+
     include Ferret::Store
     include Ferret::Search
     include Ferret::Document
@@ -77,9 +81,10 @@ module Ferret::Index
     #                  :default_slop => 2)
     #
     def initialize(options = {})
+      super()
+      options[:create_if_missing] = true if options[:create_if_missing].nil?
       if options[:path]
-        options[:create_if_missing] = true if options[:create_if_missing].nil?
-        @dir = FSDirectory.new(options[:path], true)
+        @dir = FSDirectory.new(options[:path], options[:create])
         options[:close_dir] = true
       elsif options[:dir]
         @dir = options[:dir]
@@ -88,29 +93,34 @@ module Ferret::Index
         @dir = RAMDirectory.new
       end
 
-      @options = options
-      @writer = IndexWriter.new(@dir, options)
-      options[:analyzer] = @analyzer = @writer.analyzer
-      @has_writes = false
-      @reader = nil
-      @options.delete(:create) # only want to create the first time if at all
-      @close_dir = @options.delete(:close_dir) || false # we'll hold this here
-      @default_search_field = (@options[:default_search_field] || \
-                               @options[:default_field] || "*")
-      @default_field = @options[:default_field] || ""
-      @open = true
+      @dir.synchronize do
+        @options = options
+        @writer = IndexWriter.new(@dir, options)
+        options[:analyzer] = @analyzer = @writer.analyzer
+        @has_writes = false
+        @reader = nil
+        @options.delete(:create) # only want to create the first time if at all
+        @close_dir = @options.delete(:close_dir) || false # we'll hold this here
+        @default_search_field = (@options[:default_search_field] || \
+                                 @options[:default_field] || "*")
+        @default_field = @options[:default_field] || ""
+        @open = true
+        @qp = nil
+      end
     end
 
     # Closes this index by closing its associated reader and writer objects.
     def close
-      if not @open
-        raise "tried to close an already closed directory"
-      end
-      @reader.close() if @reader
-      @writer.close() if @writer
-      @dir.close()
+      @dir.synchronize do
+        if not @open
+          raise "tried to close an already closed directory"
+        end
+        @reader.close() if @reader
+        @writer.close() if @writer
+        @dir.close()
 
-      @open = false
+        @open = false
+      end
    end
 
     # Get the reader for this index.
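The constructor now defaults :create_if_missing to true for every kind of directory and hands :create straight to FSDirectory. A minimal sketch of the common ways to open an index under these options (paths are placeholders, not from the gem):

    require 'ferret'
    include Ferret::Index

    # Opens the index at the given path, creating it if it isn't there yet
    # (:create_if_missing now defaults to true).
    index = Index.new(:path => "/tmp/my_index")

    # An explicit :create => true starts from an empty index instead of
    # opening whatever is already stored at that path.
    fresh = Index.new(:path => "/tmp/my_index", :create => true)

    # With neither :path nor :dir the index is held in a RAMDirectory.
    ram = Index.new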
@@ -133,6 +143,7 @@ module Ferret::Index
       ensure_writer_open()
       return @writer
     end
+    protected :reader, :writer, :searcher
 
     # Adds a document to this index, using the provided analyzer instead of
     # the local analyzer if provided. If the document contains more than
@@ -147,27 +158,28 @@ module Ferret::Index
     #   index << "This is a new document to be indexed"
     #   index << ["And here", "is another", "new document", "to be indexed"]
     #
-    # But these are pretty simple documents. If this is all you want to index you
-    # could probably just use SimpleSearch. So let's give our documents some fields;
+    # But these are pretty simple documents. If this is all you want to index
+    # you could probably just use SimpleSearch. So let's give our documents
+    # some fields;
     #
     #   index << {:title => "Programming Ruby", :content => "blah blah blah"}
     #   index << {:title => "Programming Ruby", :content => "yada yada yada"}
     #
-    # Or if you are indexing data stored in a database, you'll probably want to
-    # store the id;
+    # Or if you are indexing data stored in a database, you'll probably want
+    # to store the id;
     #
     #   index << {:id => row.id, :title => row.title, :date => row.date}
     #
-    # The methods above while store all of the input data as well tokenizing and
-    # indexing it. Sometimes we won't want to tokenize (divide the string into
-    # tokens) the data. For example, we might want to leave the title as a complete
-    # string and only allow searchs for that complete string. Sometimes we won't
-    # want to store the data as it's already stored in the database so it'll be a
-    # waste to store it in the index. Or perhaps we are doing without a database and
-    # using Ferret to store all of our data, in which case we might not want to
-    # index it. For example, if we are storing images in the index, we won't want to
-    # index them. All of this can be done using Ferret's Ferret::Document module.
-    # eg;
+    # The methods above while store all of the input data as well tokenizing
+    # and indexing it. Sometimes we won't want to tokenize (divide the string
+    # into tokens) the data. For example, we might want to leave the title as
+    # a complete string and only allow searchs for that complete string.
+    # Sometimes we won't want to store the data as it's already stored in the
+    # database so it'll be a waste to store it in the index. Or perhaps we are
+    # doing without a database and using Ferret to store all of our data, in
+    # which case we might not want to index it. For example, if we are storing
+    # images in the index, we won't want to index them. All of this can be
+    # done using Ferret's Ferret::Document module. eg;
     #
     #   include Ferret::Document
     #   doc = Document.new
@@ -177,35 +189,37 @@ module Ferret::Index
     #   doc << Field.new("image", row.image, Field::Store::YES, Field::Index::NO)
     #   index << doc
     #
-    # You can also compress the data that you are storing or store term vectors with
-    # the data. Read more about this in Ferret::Document::Field.
+    # You can also compress the data that you are storing or store term
+    # vectors with the data. Read more about this in Ferret::Document::Field.
     def add_document(doc, analyzer = nil)
-      ensure_writer_open()
-      fdoc = nil
-      if doc.is_a?(String)
-        fdoc = Document.new
-        fdoc << Field.new(@default_field, doc,
-                          Field::Store::YES, Field::Index::TOKENIZED)
-      elsif doc.is_a?(Array)
-        fdoc = Document.new
-        doc.each() do |field|
-          fdoc << Field.new(@default_field, field,
+      @dir.synchronize do
+        ensure_writer_open()
+        fdoc = nil
+        if doc.is_a?(String)
+          fdoc = Document.new
+          fdoc << Field.new(@default_field, doc,
                             Field::Store::YES, Field::Index::TOKENIZED)
+        elsif doc.is_a?(Array)
+          fdoc = Document.new
+          doc.each() do |field|
+            fdoc << Field.new(@default_field, field,
+                              Field::Store::YES, Field::Index::TOKENIZED)
+          end
+        elsif doc.is_a?(Hash)
+          fdoc = Document.new
+          doc.each_pair() do |field, text|
+            fdoc << Field.new(field.to_s, text.to_s,
+                              Field::Store::YES, Field::Index::TOKENIZED)
+          end
+        elsif doc.is_a?(Document)
+          fdoc = doc
+        else
+          raise ArgumentError, "Unknown document type #{doc.class}"
         end
-      elsif doc.is_a?(Hash)
-        fdoc = Document.new
-        doc.each_pair() do |field, text|
-          fdoc << Field.new(field.to_s, text.to_s,
-                            Field::Store::YES, Field::Index::TOKENIZED)
-        end
-      elsif doc.is_a?(Document)
-        fdoc = doc
-      else
-        raise ArgumentError, "Unknown document type #{doc.class}"
-      end
-      @has_writes = true
+        @has_writes = true
 
-      @writer.add_document(fdoc, analyzer || @writer.analyzer)
+        @writer.add_document(fdoc, analyzer || @writer.analyzer)
+      end
     end
     alias :<< :add_document
 
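Because add_document and the operations below now run inside @dir.synchronize (and Index includes MonitorMixin), a single Index can be shared between threads; the new thread_safety tests in this release exercise exactly that. A rough sketch of the kind of usage this is meant to make safe, not taken from those tests:

    index = Ferret::Index::Index.new(:path => "/tmp/threaded_index")

    writers = (0...4).map do |n|
      Thread.new do
        10.times {|i| index << {:id => "#{n}-#{i}", :content => "doc #{n}-#{i}"} }
      end
    end
    writers.each {|t| t.join }

    index.size   # => 40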
@@ -213,24 +227,16 @@ module Ferret::Index
     # pass to this method. You can also pass a hash with one or more of the
     # following; {filter, num_docs, first_doc, sort}
     #
-    # query::     the query to run on the index
-    # filter::    filters docs from the search result
-    # first_doc:: The index in the results of the first doc retrieved.
-    #             Default is 0
-    # num_docs::  The number of results returned. Default is 10
-    # sort::      an array of SortFields describing how to sort the results.
+    # query::     The query to run on the index
+    # filter::    Filters docs from the search result
+    # first_doc:: The index in the results of the first doc retrieved.
+    #             Default is 0
+    # num_docs::  The number of results returned. Default is 10
+    # sort::      An array of SortFields describing how to sort the results.
     def search(query, options = {})
-      ensure_searcher_open()
-      if query.is_a?(String)
-        if @qp.nil?
-          @qp = Ferret::QueryParser.new(@default_search_field, @options)
-        end
-        # we need to set this ever time, in case a new field has been added
-        @qp.fields = @reader.get_field_names.to_a
-        query = @qp.parse(query)
+      @dir.synchronize do
+        return do_search(query, options)
       end
-
-      return @searcher.search(query, options)
     end
 
     # See Index#search
@@ -241,9 +247,14 @@
     #   puts "hit document number #{doc} with a score of #{score}"
     # end
     #
+    # returns:: The total number of hits.
     def search_each(query, options = {}) # :yield: doc, score
-      search(query, options).score_docs.each do |score_doc|
-        yield score_doc.doc, score_doc.score
+      @dir.synchronize do
+        hits = do_search(query, options)
+        hits.score_docs.each do |score_doc|
+          yield score_doc.doc, score_doc.score
+        end
+        return hits.total_hits
      end
     end
 
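search_each now returns the total number of hits in addition to yielding each document number and score, so the count no longer needs a second query. A short sketch of the new return value (the query string and field name are illustrative):

    total = index.search_each("content:ruby") do |doc, score|
      puts "hit document number #{doc} with a score of #{score}"
    end
    puts "#{total} hits in total"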
@@ -253,14 +264,16 @@
     # id:: The number of the document to retrieve, or the term used as the id
     #      for the document we wish to retrieve
     def doc(id)
-      ensure_reader_open()
-      if id.is_a?(String)
-        t = Term.new("id", id.to_s)
-        return @reader.get_document_with_term(t)
-      elsif id.is_a?(Term)
-        return @reader.get_document_with_term(id)
-      else
-        return @reader.get_document(id)
+      @dir.synchronize do
+        ensure_reader_open()
+        if id.is_a?(String)
+          t = Term.new("id", id.to_s)
+          return @reader.get_document_with_term(t)
+        elsif id.is_a?(Term)
+          return @reader.get_document_with_term(id)
+        else
+          return @reader.get_document(id)
+        end
       end
     end
     alias :[] :doc
@@ -271,28 +284,34 @@
     #
     # id:: The number of the document to delete
     def delete(id)
-      ensure_reader_open()
-      if id.is_a?(String)
-        t = Term.new("id", id.to_s)
-        return @reader.delete_docs_with_term(t)
-      elsif id.is_a?(Term)
-        return @reader.delete_docs_with_term(id)
-      else
-        return @reader.delete(id)
+      @dir.synchronize do
+        ensure_reader_open()
+        if id.is_a?(String)
+          t = Term.new("id", id.to_s)
+          return @reader.delete_docs_with_term(t)
+        elsif id.is_a?(Term)
+          return @reader.delete_docs_with_term(id)
+        else
+          return @reader.delete(id)
+        end
       end
     end
 
     # Returns true if document +n+ has been deleted
     def deleted?(n)
-      ensure_reader_open()
-      return @reader.deleted?(n)
+      @dir.synchronize do
+        ensure_reader_open()
+        return @reader.deleted?(n)
+      end
     end
 
     # Returns true if any documents have been deleted since the index was last
     # flushed.
     def has_deletions?()
-      ensure_reader_open()
-      return @reader.has_deletions?
+      @dir.synchronize do
+        ensure_reader_open()
+        return @reader.has_deletions?
+      end
     end
 
     # Returns true if any documents have been added to the index since the
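As the comments above describe, doc and delete accept a document number, a Term, or a String that is treated as a term on the "id" field, so records indexed with an :id field can be fetched and removed by that id. An illustrative sketch only (document numbers assume insertion order with no merges):

    index << {:id => "42", :title => "Programming Ruby"}
    index << {:id => "43", :title => "Programming Perl"}

    index.delete("42")   # a String is treated as a term on the "id" field
    index.delete(1)      # an Integer deletes by internal document number
    index.deleted?(1)    # => true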
@@ -301,18 +320,102 @@
       return @has_writes
     end
 
+    # Flushes all writes to the index. This will not optimize the index but it
+    # will make sure that all writes are written to it.
+    #
+    # NOTE: this is not necessary if you are only using this class. All writes
+    # will automatically flush when you perform an operation that reads the
+    # index.
+    def flush()
+      @dir.synchronize do
+        @reader.close if @reader
+        @writer.close if @writer
+        @reader = nil
+        @writer = nil
+        @searcher = nil
+      end
+    end
+
     # optimizes the index. This should only be called when the index will no
     # longer be updated very often, but will be read a lot.
     def optimize()
-      ensure_writer_open()
-      @writer.optimize()
-      @modified = true
+      @dir.synchronize do
+        ensure_writer_open()
+        @writer.optimize()
+        @modified = true
+      end
     end
 
     # returns the number of documents in the index
     def size()
-      ensure_reader_open()
-      return @reader.num_docs()
+      @dir.synchronize do
+        ensure_reader_open()
+        return @reader.num_docs()
+      end
+    end
+
+    # Merges all segments from an index or an array of indexes into this
+    # index. You can pass a single Index::Index, Index::Reader,
+    # Store::Directory or an array of any single one of these.
+    #
+    # This may be used to parallelize batch indexing. A large document
+    # collection can be broken into sub-collections. Each sub-collection can
+    # be indexed in parallel, on a different thread, process or machine and
+    # perhaps all in memory. The complete index can then be created by
+    # merging sub-collection indexes with this method.
+    #
+    # After this completes, the index is optimized.
+    def add_indexes(indexes)
+      @dir.synchronize do
+        indexes = [indexes].flatten # make sure we have an array
+        return if indexes.size == 0 # nothing to do
+        if indexes[0].is_a?(Index)
+          readers = indexes.map {|index| index.reader }
+          indexes = readers
+        end
+
+        if indexes[0].is_a?(IndexReader)
+          ensure_reader_open
+          indexes.delete(@reader) # we don't want to merge with self
+          ensure_writer_open
+          @writer.add_indexes_readers(indexes)
+        elsif indexes[0].is_a?(Ferret::Store::Directory)
+          indexes.delete(@dir) # we don't want to merge with self
+          ensure_writer_open
+          @writer.add_indexes(indexes)
+        else
+          raise ArgumentError, "Unknown index type when trying to merge indexes"
+        end
+      end
+    end
+
+    # This is a simple utility method for saving an in memory or RAM index to
+    # the file system. The same thing can be achieved by using the
+    # Index::Index#add_indexes method and you will have more options when
+    # creating the new index, however this is a simple way to turn a RAM index
+    # into a file system index.
+    #
+    # directory:: This can either be a Store::Directory object or a string
+    #             representing the path to the directory where you would
+    #             like to store the the index.
+    #
+    # create::    True if you'd like to create the directory if it doesn't
+    #             exist or copy over an existing directory. False if you'd
+    #             like to merge with the existing directory. This defaults to
+    #             false.
+    def persist(directory, create = true)
+      synchronize do
+        flush
+        old_dir = @dir
+        if directory.is_a?(String)
+          @dir = FSDirectory.new(directory, create)
+          @options[:close_dir] = true
+        elsif directory.is_a?(Ferret::Store::Directory)
+          @dir = directory
+        end
+        ensure_writer_open
+        @writer.add_indexes([old_dir])
+      end
     end
 
     protected
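The new persist method, together with flush and add_indexes above, covers the "persist an in memory index" item from the TODO: it merges the current, typically RAM-based, index into a directory on disk. A minimal sketch under those doc comments (the path is a placeholder):

    index = Ferret::Index::Index.new       # starts life in a RAMDirectory
    index << {:id => "1", :content => "kept only in memory so far"}

    # Write the in-memory index out to the file system. With create = true
    # (the default) the target directory is created or replaced.
    index.persist("/tmp/persisted_index")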
@@ -343,5 +446,20 @@
       ensure_reader_open()
       @searcher = IndexSearcher.new(@reader)
     end
+
+    private
+    def do_search(query, options)
+      ensure_searcher_open()
+      if query.is_a?(String)
+        if @qp.nil?
+          @qp = Ferret::QueryParser.new(@default_search_field, @options)
+        end
+        # we need to set this ever time, in case a new field has been added
+        @qp.fields = @reader.get_field_names.to_a
+        query = @qp.parse(query)
+      end
+
+      return @searcher.search(query, options)
+    end
   end
 end