ferret 0.2.2 → 0.3.0

Files changed (57)
  1. data/ext/Makefile +2 -2
  2. data/ext/ferret.c +27 -2
  3. data/ext/ferret.h +59 -16
  4. data/ext/ferret_ext.so +0 -0
  5. data/ext/index_io.c +72 -77
  6. data/ext/priority_queue.c +150 -145
  7. data/ext/ram_directory.c +47 -42
  8. data/ext/segment_merge_queue.c +4 -8
  9. data/ext/segment_term_enum.c +324 -0
  10. data/ext/similarity.c +59 -0
  11. data/ext/string_helper.c +2 -2
  12. data/ext/tags +150 -46
  13. data/ext/term.c +107 -152
  14. data/ext/term_buffer.c +105 -174
  15. data/ext/term_infos_reader.c +54 -0
  16. data/ext/terminfo.c +160 -0
  17. data/ext/token.c +93 -0
  18. data/lib/ferret.rb +1 -1
  19. data/lib/ferret/analysis/analyzers.rb +18 -0
  20. data/lib/ferret/analysis/standard_tokenizer.rb +19 -14
  21. data/lib/ferret/analysis/token.rb +8 -1
  22. data/lib/ferret/analysis/tokenizers.rb +10 -5
  23. data/lib/ferret/document/field.rb +33 -11
  24. data/lib/ferret/index/document_writer.rb +3 -2
  25. data/lib/ferret/index/field_infos.rb +38 -12
  26. data/lib/ferret/index/fields_io.rb +10 -4
  27. data/lib/ferret/index/index.rb +20 -4
  28. data/lib/ferret/index/index_reader.rb +19 -4
  29. data/lib/ferret/index/index_writer.rb +1 -1
  30. data/lib/ferret/index/multi_reader.rb +21 -7
  31. data/lib/ferret/index/segment_merge_info.rb +24 -22
  32. data/lib/ferret/index/segment_merge_queue.rb +2 -2
  33. data/lib/ferret/index/segment_merger.rb +28 -11
  34. data/lib/ferret/index/segment_reader.rb +19 -4
  35. data/lib/ferret/index/segment_term_enum.rb +3 -11
  36. data/lib/ferret/index/term_buffer.rb +13 -16
  37. data/lib/ferret/index/term_doc_enum.rb +8 -5
  38. data/lib/ferret/index/term_enum.rb +2 -2
  39. data/lib/ferret/index/term_info.rb +1 -5
  40. data/lib/ferret/index/term_infos_io.rb +2 -0
  41. data/lib/ferret/query_parser/query_parser.tab.rb +7 -7
  42. data/lib/ferret/search/phrase_scorer.rb +0 -1
  43. data/lib/ferret/search/similarity.rb +2 -2
  44. data/lib/ferret/search/term_scorer.rb +2 -2
  45. data/lib/ferret/store/directory.rb +2 -0
  46. data/lib/ferret/store/fs_store.rb +16 -3
  47. data/lib/ferret/store/ram_store.rb +2 -2
  48. data/test/unit/document/tc_field.rb +9 -0
  49. data/test/unit/index/tc_field_infos.rb +29 -21
  50. data/test/unit/index/tc_index.rb +44 -7
  51. data/test/unit/index/tc_term_buffer.rb +3 -3
  52. data/test/unit/index/tc_term_info.rb +1 -1
  53. data/test/unit/query_parser/tc_query_parser.rb +1 -1
  54. data/test/unit/search/tc_index_searcher.rb +3 -0
  55. data/test/unit/store/tc_fs_store.rb +47 -16
  56. data/test/unit/store/tc_ram_store.rb +1 -1
  57. metadata +8 -3
data/lib/ferret/document/field.rb

@@ -71,6 +71,12 @@ module Ferret::Document
  # field string
  def store_offsets?() return @store_offset end

+ # True if the norms are not stored for this field. No norms means that
+ # index-time boosting and field length normalization will be disabled.
+ # The benefit is less memory usage as norms take up one byte per indexed
+ # field for every document in the index.
+ def omit_norms?() return @omit_norms end
+
  class Store < Ferret::Utils::Parameter
  # Store the original field value in the index in a compressed form.
  # This is useful for long documents and for binary valued fields.
@@ -101,6 +107,13 @@ module Ferret::Document
  # searched. As no analyzer is used the value will be stored as a
  # single term. This is useful for unique Ids like product numbers.
  UNTOKENIZED = Index.new("UNTOKENIZED")
+
+ # Index the field's value without an Analyzer, and disable the storing
+ # of norms. No norms means that index-time boosting and field length
+ # normalization will be disabled. The benefit is less memory usage as
+ # norms take up one byte per indexed field for every document in the
+ # index.
+ NO_NORMS = Index.new("NO_NORMS");
  end

  class TermVector < Ferret::Utils::Parameter
@@ -174,13 +187,14 @@ module Ferret::Document
  end

  def stored=(stored)
- if (stored == Store::YES)
+ case stored
+ when Store::YES
  @stored = true
  @compressed = false
- elsif (stored == Store::COMPRESS)
+ when Store::COMPRESS
  @stored = true
  @compressed = true
- elsif (stored == Store::NO)
+ when Store::NO
  @stored = false
  @compressed = false
  else
@@ -189,38 +203,45 @@ module Ferret::Document
  end

  def index=(index)
- if (index == Index::NO)
+ @omit_norms = false
+ case index
+ when Index::NO
  @indexed = false
  @tokenized = false
- elsif (index == Index::TOKENIZED)
+ when Index::TOKENIZED
  @indexed = true
  @tokenized = true
- elsif (index == Index::UNTOKENIZED)
+ when Index::UNTOKENIZED
+ @indexed = true
+ @tokenized = false
+ when Index::NO_NORMS
  @indexed = true
  @tokenized = false
+ @omit_norms = true
  else
  raise "unknown stored parameter " + index.to_s
  end
  end

  def store_term_vector=(store_term_vector)
- if (store_term_vector == TermVector::NO)
+ case store_term_vector
+ when TermVector::NO
  @store_term_vector = false
  @store_position = false
  @store_offset = false
- elsif (store_term_vector == TermVector::YES)
+ when TermVector::YES
  @store_term_vector = true
  @store_position = false
  @store_offset = false
- elsif (store_term_vector == TermVector::WITH_POSITIONS)
+ when TermVector::WITH_POSITIONS
  @store_term_vector = true
  @store_position = true
  @store_offset = false
- elsif (store_term_vector == TermVector::WITH_OFFSETS)
+ when TermVector::WITH_OFFSETS
  @store_term_vector = true
  @store_position = false
  @store_offset = true
- elsif (store_term_vector == TermVector::WITH_POSITIONS_OFFSETS)
+ when TermVector::WITH_POSITIONS_OFFSETS
  @store_term_vector = true
  @store_position = true
  @store_offset = true
@@ -284,6 +305,7 @@ module Ferret::Document
  str << "store_term_vector," if (@store_term_vector)
  str << "tv_offset," if (@store_offset)
  str << "tv_position," if (@store_position)
+ str << "omit_norms," if (@omit_norms)
  str << "binary," if (@binary)
  str << "<#{@name}:#{data}>"
  end
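
The Field changes above add a NO_NORMS indexing mode alongside TOKENIZED and UNTOKENIZED. A minimal sketch of how it might be used, assuming a Lucene-style Field constructor that takes name, value, store and index arguments (the constructor itself is not part of this diff):

    include Ferret::Document

    # Index a unique id without an analyzer and without norms. Omitting norms
    # disables index-time boosting and field length normalization for this
    # field, saving one byte per document in the index.
    field = Field.new("product_id", "XK-2345",
                      Field::Store::YES, Field::Index::NO_NORMS)

    field.indexed?    #=> true
    field.tokenized?  #=> false
    field.omit_norms? #=> true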
data/lib/ferret/index/document_writer.rb

@@ -92,9 +92,10 @@ module Ferret::Index

  length = @field_lengths[field_number] # length of field
  position = @field_positions[field_number] # position in field
+ position += @analyzer.position_increment_gap(field_name) if length > 0
  offset = @field_offsets[field_number] # offset field

- if field_info.indexed?
+ if field_info.indexed?
  if not field.tokenized? # un-tokenized field
  string_value = field.string_value
  if field_info.store_offsets?
@@ -261,7 +262,7 @@ module Ferret::Index

  def write_norms(segment)
  @field_infos.each_with_index do |fi, i|
- if fi.indexed?
+ if fi.indexed? and not fi.omit_norms?
  norm = @field_boosts[i] * @similarity.length_norm(fi.name, @field_lengths[i])
  norms = @directory.create_output(segment + ".f" + i.to_s)
  begin
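
The position_increment_gap call added above lets the analyzer insert a positional gap between successive values of the same multi-valued field, so phrase and slop queries do not match across value boundaries. A sketch of an analyzer supplying such a gap (the subclass name and gap size are illustrative):

    # Leave 100 unused positions between the values added for a field, so a
    # phrase query cannot span two separately added values.
    class GapAnalyzer < Ferret::Analysis::WhiteSpaceAnalyzer
      def position_increment_gap(field_name)
        100
      end
    end

    index = Ferret::Index::Index.new(:analyzer => GapAnalyzer.new())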
data/lib/ferret/index/field_infos.rb

@@ -35,7 +35,8 @@ module Ferret
  field.indexed?,
  field.store_term_vector?,
  field.store_positions?,
- field.store_offsets?)
+ field.store_offsets?,
+ field.omit_norms?)
  end
  end
  alias :<< :add_doc_fields
@@ -45,9 +46,11 @@ module Ferret
  indexed = true,
  store_term_vector = false,
  store_position = false,
- store_offset = false)
+ store_offset = false,
+ omit_norms = false)
  names.each do |name|
- add(name, indexed, store_term_vector, store_position, store_offset)
+ add(name, indexed, store_term_vector, store_position,
+ store_offset, omit_norms)
  end
  end

@@ -65,10 +68,12 @@ module Ferret
  indexed = true,
  store_term_vector = false,
  store_position = false,
- store_offset = false)
+ store_offset = false,
+ omit_norms = false)
  fi = @fi_hash[name]
  if (fi == nil)
- fi = add_internal(name, indexed, store_term_vector, store_position, store_offset)
+ fi = add_internal(name, indexed, store_term_vector, store_position,
+ store_offset, omit_norms)
  else
  if (fi.indexed? != indexed)
  fi.indexed = true # once indexed, always index
@@ -82,6 +87,9 @@ module Ferret
  if (fi.store_offsets? != store_offset)
  fi.store_offset = true # once vector, always vector
  end
+ if (fi.omit_norms? != omit_norms)
+ fi.omit_norms = false # once norms are stored, always store norms
+ end
  end
  return fi
  end
@@ -174,7 +182,9 @@ module Ferret
  store_term_vector = (bits & STORE_TERM_VECTOR) != 0
  store_position = (bits & STORE_POSITION) != 0
  store_offset = (bits & STORE_OFFSET) != 0
- add_internal(name, indexed, store_term_vector, store_position, store_offset)
+ omit_norms = (bits & OMIT_NORMS) != 0
+ add_internal(name, indexed, store_term_vector, store_position,
+ store_offset, omit_norms)
  end
  end

@@ -183,15 +193,18 @@ module Ferret
  STORE_TERM_VECTOR = 0x2;
  STORE_POSITION = 0x4;
  STORE_OFFSET = 0x8;
+ OMIT_NORMS = 0x10;

  def add_internal(name, indexed, store_term_vector,
  store_position = false,
- store_offset = false)
+ store_offset = false,
+ omit_norms = false)
  fi = FieldInfo.new(name, indexed,
  @fi_array.size(),
  store_term_vector,
  store_position,
- store_offset)
+ store_offset,
+ omit_norms)
  @fi_array << fi
  @fi_hash[name] = fi
  return fi
@@ -211,13 +224,17 @@ module Ferret
  if (fi.store_offsets?)
  bits |= STORE_OFFSET
  end
+ if (fi.omit_norms?)
+ bits |= OMIT_NORMS
+ end
  return bits
  end
  end

  class FieldInfo
  attr_accessor :name, :number
- attr_writer :indexed, :store_term_vector, :store_offset, :store_position
+ attr_writer :indexed, :store_term_vector, :store_offset,
+ :store_position, :omit_norms

  def indexed?()
  return @indexed
@@ -230,23 +247,32 @@ module Ferret
  def store_offsets?()
  return @store_offset
  end
+
  def store_positions?()
  return @store_position
  end

- def set!(indexed, store_term_vector, store_position, store_offset)
+ def omit_norms?()
+ return @omit_norms
+ end
+
+ def set!(indexed, store_term_vector, store_position,
+ store_offset, omit_norms)
  @indexed = indexed
  @store_term_vector = store_term_vector
  @store_position = store_position
  @store_offset = store_offset
+ @omit_norms = omit_norms
  end

  def initialize(name, indexed, number, store_term_vector,
  store_position = false,
- store_offset = false)
+ store_offset = false,
+ omit_norms = false)
  @name = name
  @number = number
- set!(indexed, store_term_vector, store_position, store_offset)
+ set!(indexed, store_term_vector, store_position,
+ store_offset, omit_norms)
  end
  end
  end
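
The new OMIT_NORMS = 0x10 flag above is packed into the same per-field bit mask that FieldInfos already writes for the other field attributes. A small sketch of the encoding (the 0x1 bit for "indexed" is an assumption; it does not appear in this hunk):

    INDEXED           = 0x1   # assumed, not shown in this diff
    STORE_TERM_VECTOR = 0x2
    STORE_POSITION    = 0x4
    STORE_OFFSET      = 0x8
    OMIT_NORMS        = 0x10

    bits = INDEXED | OMIT_NORMS
    printf("0x%02x\n", bits)           # => 0x11
    puts((bits & OMIT_NORMS) != 0)     # => true: no norms file is written
    puts((bits & STORE_OFFSET) != 0)   # => false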
data/lib/ferret/index/fields_io.rb

@@ -55,10 +55,16 @@ module Ferret::Index
  end
  else
  store = Field::Store::YES
- if fi.indexed? and tokenize
- index = Field::Index::TOKENIZED
- elsif fi.indexed? and not tokenize
- index = Field::Index::UNTOKENIZED
+ if fi.indexed?
+ if tokenize
+ index = Field::Index::TOKENIZED
+ else
+ if fi.omit_norms?
+ index = Field::Index::NO_NORMS
+ else
+ index = Field::Index::UNTOKENIZED
+ end
+ end
  else
  index = Field::Index::NO
  end
data/lib/ferret/index/index.rb

@@ -34,7 +34,8 @@ module Ferret::Index
  # used when you add a simple string to the index
  # using #add_document. This will also be used for
  # default_search_field unless you set it
- # explicitly.
+ # explicitly. The default for this value is the
+ # empty string "".
  # default_search_field:: This specifies the field or fields that will be
  # searched by the query parser. You can use a
  # string to specify one field, eg, "title". Or you
@@ -93,13 +94,20 @@ module Ferret::Index
  # your query. This defualts to true. If you set it
  # to false a QueryParseException is raised on a
  # query parse error.
+ # auto_flush:: Set this option to true if you want the index
+ # automatically flushed every time you do a write
+ # (includes delete) to the index. This is useful if
+ # you have multiple processes accessing the index
+ # and you don't want lock errors. This is set to
+ # false by default.
  #
  # Some examples;
  #
  # index = Index::Index.new(:analyzer => WhiteSpaceAnalyzer.new())
  #
  # index = Index::Index.new(:path => '/path/to/index',
- # :create_if_missing => false)
+ # :create_if_missing => false,
+ # :auto_flush => true)
  #
  # index = Index::Index.new(:dir => directory,
  # :close_dir => false
@@ -126,12 +134,15 @@ module Ferret::Index

  @dir.synchronize do
  @options = options
- @writer = IndexWriter.new(@dir, options)
+ @writer = IndexWriter.new(@dir, options) # create the index if need be
  options[:analyzer] = @analyzer = @writer.analyzer
+ @writer.close
+ @writer = nil
  @has_writes = false
  @reader = nil
  @options.delete(:create) # only want to create the first time if at all
  @close_dir = @options.delete(:close_dir) || false # we'll hold this here
+ @auto_flush = @options[:auto_flush] || false
  @default_search_field = (@options[:default_search_field] || \
  @options[:default_field] || "*")
  @default_field = @options[:default_field] || ""
@@ -257,6 +268,7 @@ module Ferret::Index
  ensure_writer_open()
  @has_writes = true
  @writer.add_document(fdoc, analyzer || @writer.analyzer)
+ flush() if @auto_flush
  end
  end
  alias :<< :add_document
@@ -334,6 +346,7 @@ module Ferret::Index
  else
  raise ArgumentError, "Cannot delete for id of type #{id.class}"
  end
+ flush() if @auto_flush
  end
  end

@@ -349,6 +362,7 @@ module Ferret::Index
  @searcher.search_each(query) do |doc, score|
  @reader.delete(doc)
  end
+ flush() if @auto_flush
  end
  end

@@ -393,6 +407,7 @@ module Ferret::Index
  else
  raise ArgumentError, "Cannot update for id of type #{id.class}"
  end
+ flush() if @auto_flush
  end
  end

@@ -429,6 +444,7 @@ module Ferret::Index
  docs_to_add.each do |document|
  @writer.add_document(document)
  end
+ flush() if @auto_flush
  end
  end

@@ -532,7 +548,7 @@ module Ferret::Index
  # false.
  def persist(directory, create = true)
  synchronize do
- flush
+ flush()
  old_dir = @dir
  if directory.is_a?(String)
  @dir = FSDirectory.new(directory, create)
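
A short usage sketch of the new :auto_flush option wired through the write paths above (the index path is illustrative):

    require 'ferret'

    index = Ferret::Index::Index.new(:path => '/tmp/example_index',
                                     :create_if_missing => true,
                                     :auto_flush => true)

    # With :auto_flush => true every write (add_document, delete, update) is
    # flushed straight away, so another process reading the same directory
    # sees the change without an explicit call to index.flush().
    index << "a simple string document"   # goes into the default field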
data/lib/ferret/index/index_reader.rb

@@ -143,7 +143,7 @@ module Ferret::Index
  # stored for the specified document.
  # raises:: IOError if index cannot be accessed
  #
- # See Field.TermVector
+ # See Field::TermVector
  def get_term_vectors(doc_number)
  raise NotImplementedError
  end
@@ -161,7 +161,7 @@ module Ferret::Index
  # returns:: term vector May be nil if field does not exist in the specified
  # document or term vector was not stored.
  # raises:: IOError if index cannot be accessed
- # See Field.TermVector
+ # See Field::TermVector
  def get_term_vector(doc_number, field)
  raise NotImplementedError
  end
@@ -223,12 +223,27 @@ module Ferret::Index
  def has_deletions?()
  raise NotImplementedError
  end
-
+
+ # Returns true if there are norms stored for this field.
+ def has_norms?(field)
+ # backward compatible implementation.
+ # SegmentReader has an efficient implementation.
+ return (get_norms(field) != nil)
+ end
+
  # Returns the byte-encoded normalization factor for the named field of
  # every document. This is used by the search code to score documents.
  #
  # See Field#boost
- def get_norms(field, bytes=nil, offset=nil)
+ def get_norms(field)
+ raise NotImplementedError
+ end
+
+ # Read norms into a pre-allocated array. This is used as an optimization
+ # of get_norms.
+ #
+ # See Field#boost
+ def get_norms_into(field, bytes, offset)
  raise NotImplementedError
  end
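
The abstract reader above now separates get_norms, which returns the full byte string for a field, from get_norms_into, which fills a caller-supplied buffer, with has_norms? defaulting to a simple get_norms check. A sketch of a caller filling one shared buffer for several fields, assuming reader is an open IndexReader:

    fields = ["title", "content"]
    buf = " " * (reader.max_doc * fields.length)

    fields.each_with_index do |field, i|
      # Fields indexed with omit_norms never had norms written, so skip them.
      next unless reader.has_norms?(field)
      reader.get_norms_into(field, buf, i * reader.max_doc)
    end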
data/lib/ferret/index/index_writer.rb

@@ -455,7 +455,7 @@ module Index
  write_deleteable_files(deletable) # note files we can't delete
  # This is a great time to start the garbage collector as all of our
  # ram files have just become free
- GC.start
+ #GC.start

  ##############################################################################
  # objs = {}
data/lib/ferret/index/multi_reader.rb

@@ -110,12 +110,20 @@ module Ferret::Index
  return hi
  end

+ def has_norms?(field)
+ @sub_readers.each {|reader| return true if reader.has_norms?(field)}
+ return false
+ end
+
+ def fake_norms()
+ return @ones ||= SegmentReader.create_fake_norms(max_doc())
+ end
+
  def get_norms(field)
  synchronize do
  bytes = @norms_cache[field]
- if (bytes != nil)
- return bytes # cache hit
- end
+ return bytes if bytes
+ return fake_norms if not has_norms?(field)

  bytes = " " * @max_doc
  @sub_readers.length.times do |i|
@@ -129,7 +137,9 @@ module Ferret::Index
  def get_norms_into(field, buf, offset)
  synchronize do
  bytes = @norms_cache[field]
- if (bytes != nil) # cache hit
+ bytes = fake_norms() if (bytes.nil? and not has_norms?(field))
+
+ if (bytes) # cache hit
  buf[offset ,@max_doc] = bytes[0, @max_doc]
  return
  end
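
When no sub-reader has norms for a field (for example because it was indexed with NO_NORMS), MultiReader above now falls back to SegmentReader.create_fake_norms instead of failing. That helper is not part of this diff; a plausible sketch, following the Lucene convention of one identical byte encoding a norm of 1.0 for every document (encode_norm is an assumed helper):

    # Hypothetical equivalent of SegmentReader.create_fake_norms: max_doc
    # copies of the byte that encodes a field norm of 1.0, so scoring code
    # can read norms uniformly whether or not they were actually stored.
    def create_fake_norms(max_doc)
      encode_norm(1.0).chr * max_doc   # encode_norm is assumed, not shown here
    end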
@@ -220,14 +230,14 @@
  def next?()
  top = @queue.top()
  if (top == nil)
- @term = nil
+ @term_buffer = nil
  return false
  end

- @term = top.term
+ @term = top.term_buffer.term
  @doc_freq = 0

- while top and @term == top.term
+ while top and @term == top.term_buffer
  @queue.pop()
  @doc_freq += top.term_enum.doc_freq() # increment freq
  if (top.next?)
@@ -240,6 +250,10 @@
  return true
  end

+ #def term()
+ # @term_buffer.term if @term_buffer
+ #end
+
  def close()
  @queue.close()
  end