ferret 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. data/ext/Makefile +2 -2
  2. data/ext/ferret.c +27 -2
  3. data/ext/ferret.h +59 -16
  4. data/ext/ferret_ext.so +0 -0
  5. data/ext/index_io.c +72 -77
  6. data/ext/priority_queue.c +150 -145
  7. data/ext/ram_directory.c +47 -42
  8. data/ext/segment_merge_queue.c +4 -8
  9. data/ext/segment_term_enum.c +324 -0
  10. data/ext/similarity.c +59 -0
  11. data/ext/string_helper.c +2 -2
  12. data/ext/tags +150 -46
  13. data/ext/term.c +107 -152
  14. data/ext/term_buffer.c +105 -174
  15. data/ext/term_infos_reader.c +54 -0
  16. data/ext/terminfo.c +160 -0
  17. data/ext/token.c +93 -0
  18. data/lib/ferret.rb +1 -1
  19. data/lib/ferret/analysis/analyzers.rb +18 -0
  20. data/lib/ferret/analysis/standard_tokenizer.rb +19 -14
  21. data/lib/ferret/analysis/token.rb +8 -1
  22. data/lib/ferret/analysis/tokenizers.rb +10 -5
  23. data/lib/ferret/document/field.rb +33 -11
  24. data/lib/ferret/index/document_writer.rb +3 -2
  25. data/lib/ferret/index/field_infos.rb +38 -12
  26. data/lib/ferret/index/fields_io.rb +10 -4
  27. data/lib/ferret/index/index.rb +20 -4
  28. data/lib/ferret/index/index_reader.rb +19 -4
  29. data/lib/ferret/index/index_writer.rb +1 -1
  30. data/lib/ferret/index/multi_reader.rb +21 -7
  31. data/lib/ferret/index/segment_merge_info.rb +24 -22
  32. data/lib/ferret/index/segment_merge_queue.rb +2 -2
  33. data/lib/ferret/index/segment_merger.rb +28 -11
  34. data/lib/ferret/index/segment_reader.rb +19 -4
  35. data/lib/ferret/index/segment_term_enum.rb +3 -11
  36. data/lib/ferret/index/term_buffer.rb +13 -16
  37. data/lib/ferret/index/term_doc_enum.rb +8 -5
  38. data/lib/ferret/index/term_enum.rb +2 -2
  39. data/lib/ferret/index/term_info.rb +1 -5
  40. data/lib/ferret/index/term_infos_io.rb +2 -0
  41. data/lib/ferret/query_parser/query_parser.tab.rb +7 -7
  42. data/lib/ferret/search/phrase_scorer.rb +0 -1
  43. data/lib/ferret/search/similarity.rb +2 -2
  44. data/lib/ferret/search/term_scorer.rb +2 -2
  45. data/lib/ferret/store/directory.rb +2 -0
  46. data/lib/ferret/store/fs_store.rb +16 -3
  47. data/lib/ferret/store/ram_store.rb +2 -2
  48. data/test/unit/document/tc_field.rb +9 -0
  49. data/test/unit/index/tc_field_infos.rb +29 -21
  50. data/test/unit/index/tc_index.rb +44 -7
  51. data/test/unit/index/tc_term_buffer.rb +3 -3
  52. data/test/unit/index/tc_term_info.rb +1 -1
  53. data/test/unit/query_parser/tc_query_parser.rb +1 -1
  54. data/test/unit/search/tc_index_searcher.rb +3 -0
  55. data/test/unit/store/tc_fs_store.rb +47 -16
  56. data/test/unit/store/tc_ram_store.rb +1 -1
  57. metadata +8 -3
@@ -71,6 +71,12 @@ module Ferret::Document
  # field string
  def store_offsets?() return @store_offset end
 
+ # True if the norms are not stored for this field. No norms means that
+ # index-time boosting and field length normalization will be disabled.
+ # The benefit is less memory usage as norms take up one byte per indexed
+ # field for every document in the index.
+ def omit_norms?() return @omit_norms end
+
  class Store < Ferret::Utils::Parameter
  # Store the original field value in the index in a compressed form.
  # This is useful for long documents and for binary valued fields.
@@ -101,6 +107,13 @@ module Ferret::Document
  # searched. As no analyzer is used the value will be stored as a
  # single term. This is useful for unique Ids like product numbers.
  UNTOKENIZED = Index.new("UNTOKENIZED")
+
+ # Index the field's value without an Analyzer, and disable the storing
+ # of norms. No norms means that index-time boosting and field length
+ # normalization will be disabled. The benefit is less memory usage as
+ # norms take up one byte per indexed field for every document in the
+ # index.
+ NO_NORMS = Index.new("NO_NORMS");
  end
 
  class TermVector < Ferret::Utils::Parameter
@@ -174,13 +187,14 @@ module Ferret::Document
  end
 
  def stored=(stored)
- if (stored == Store::YES)
+ case stored
+ when Store::YES
  @stored = true
  @compressed = false
- elsif (stored == Store::COMPRESS)
+ when Store::COMPRESS
  @stored = true
  @compressed = true
- elsif (stored == Store::NO)
+ when Store::NO
  @stored = false
  @compressed = false
  else
@@ -189,38 +203,45 @@ module Ferret::Document
  end
 
  def index=(index)
- if (index == Index::NO)
+ @omit_norms = false
+ case index
+ when Index::NO
  @indexed = false
  @tokenized = false
- elsif (index == Index::TOKENIZED)
+ when Index::TOKENIZED
  @indexed = true
  @tokenized = true
- elsif (index == Index::UNTOKENIZED)
+ when Index::UNTOKENIZED
+ @indexed = true
+ @tokenized = false
+ when Index::NO_NORMS
  @indexed = true
  @tokenized = false
+ @omit_norms = true
  else
  raise "unknown stored parameter " + index.to_s
  end
  end
 
  def store_term_vector=(store_term_vector)
- if (store_term_vector == TermVector::NO)
+ case store_term_vector
+ when TermVector::NO
  @store_term_vector = false
  @store_position = false
  @store_offset = false
- elsif (store_term_vector == TermVector::YES)
+ when TermVector::YES
  @store_term_vector = true
  @store_position = false
  @store_offset = false
- elsif (store_term_vector == TermVector::WITH_POSITIONS)
+ when TermVector::WITH_POSITIONS
  @store_term_vector = true
  @store_position = true
  @store_offset = false
- elsif (store_term_vector == TermVector::WITH_OFFSETS)
+ when TermVector::WITH_OFFSETS
  @store_term_vector = true
  @store_position = false
  @store_offset = true
- elsif (store_term_vector == TermVector::WITH_POSITIONS_OFFSETS)
+ when TermVector::WITH_POSITIONS_OFFSETS
  @store_term_vector = true
  @store_position = true
  @store_offset = true
@@ -284,6 +305,7 @@ module Ferret::Document
  str << "store_term_vector," if (@store_term_vector)
  str << "tv_offset," if (@store_offset)
  str << "tv_position," if (@store_position)
+ str << "omit_norms," if (@omit_norms)
  str << "binary," if (@binary)
  str << "<#{@name}:#{data}>"
  end
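
The hunks above add norm-omission support to Ferret::Document::Field: an omit_norms? accessor, a Field::Index::NO_NORMS parameter, and case-based setters that toggle @omit_norms. A minimal usage sketch follows; it is not taken from the package, and it assumes Field.new accepts name, value, store and index arguments in that order (mirroring the Lucene API Ferret ports), while Field::Store::YES, Field::Index::NO_NORMS, omit_norms? and tokenized? are the names shown in the diff.

    require 'ferret'
    include Ferret::Document

    # Sketch only: index a product id as a single term and skip its norms,
    # saving one norm byte per document for this field.
    # Field.new(name, value, store, index) is an assumed signature.
    field = Field.new("id", "PROD-1234",
                      Field::Store::YES,       # keep the raw value in the index
                      Field::Index::NO_NORMS)  # single term, norms disabled

    field.omit_norms?   # => true  (accessor added in this release)
    field.tokenized?    # => false (NO_NORMS implies an un-tokenized field)
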
@@ -92,9 +92,10 @@ module Ferret::Index
 
  length = @field_lengths[field_number] # length of field
  position = @field_positions[field_number] # position in field
+ position += @analyzer.position_increment_gap(field_name) if length > 0
  offset = @field_offsets[field_number] # offset field
 
- if field_info.indexed?
+ if field_info.indexed?
  if not field.tokenized? # un-tokenized field
  string_value = field.string_value
  if field_info.store_offsets?
@@ -261,7 +262,7 @@ module Ferret::Index
 
  def write_norms(segment)
  @field_infos.each_with_index do |fi, i|
- if fi.indexed?
+ if fi.indexed? and not fi.omit_norms?
  norm = @field_boosts[i] * @similarity.length_norm(fi.name, @field_lengths[i])
  norms = @directory.create_output(segment + ".f" + i.to_s)
  begin
@@ -35,7 +35,8 @@ module Ferret
  field.indexed?,
  field.store_term_vector?,
  field.store_positions?,
- field.store_offsets?)
+ field.store_offsets?,
+ field.omit_norms?)
  end
  end
  alias :<< :add_doc_fields
@@ -45,9 +46,11 @@ module Ferret
  indexed = true,
  store_term_vector = false,
  store_position = false,
- store_offset = false)
+ store_offset = false,
+ omit_norms = false)
  names.each do |name|
- add(name, indexed, store_term_vector, store_position, store_offset)
+ add(name, indexed, store_term_vector, store_position,
+ store_offset, omit_norms)
  end
  end
 
@@ -65,10 +68,12 @@ module Ferret
  indexed = true,
  store_term_vector = false,
  store_position = false,
- store_offset = false)
+ store_offset = false,
+ omit_norms = false)
  fi = @fi_hash[name]
  if (fi == nil)
- fi = add_internal(name, indexed, store_term_vector, store_position, store_offset)
+ fi = add_internal(name, indexed, store_term_vector, store_position,
+ store_offset, omit_norms)
  else
  if (fi.indexed? != indexed)
  fi.indexed = true # once indexed, always index
@@ -82,6 +87,9 @@ module Ferret
  if (fi.store_offsets? != store_offset)
  fi.store_offset = true # once vector, always vector
  end
+ if (fi.omit_norms? != omit_norms)
+ fi.omit_norms = false # once norms are stored, always store norms
+ end
  end
  return fi
  end
@@ -174,7 +182,9 @@ module Ferret
  store_term_vector = (bits & STORE_TERM_VECTOR) != 0
  store_position = (bits & STORE_POSITION) != 0
  store_offset = (bits & STORE_OFFSET) != 0
- add_internal(name, indexed, store_term_vector, store_position, store_offset)
+ omit_norms = (bits & OMIT_NORMS) != 0
+ add_internal(name, indexed, store_term_vector, store_position,
+ store_offset, omit_norms)
  end
  end
 
@@ -183,15 +193,18 @@ module Ferret
  STORE_TERM_VECTOR = 0x2;
  STORE_POSITION = 0x4;
  STORE_OFFSET = 0x8;
+ OMIT_NORMS = 0x10;
 
  def add_internal(name, indexed, store_term_vector,
  store_position = false,
- store_offset = false)
+ store_offset = false,
+ omit_norms = false)
  fi = FieldInfo.new(name, indexed,
  @fi_array.size(),
  store_term_vector,
  store_position,
- store_offset)
+ store_offset,
+ omit_norms)
  @fi_array << fi
  @fi_hash[name] = fi
  return fi
@@ -211,13 +224,17 @@ module Ferret
  if (fi.store_offsets?)
  bits |= STORE_OFFSET
  end
+ if (fi.omit_norms?)
+ bits |= OMIT_NORMS
+ end
  return bits
  end
  end
 
  class FieldInfo
  attr_accessor :name, :number
- attr_writer :indexed, :store_term_vector, :store_offset, :store_position
+ attr_writer :indexed, :store_term_vector, :store_offset,
+ :store_position, :omit_norms
 
  def indexed?()
  return @indexed
@@ -230,23 +247,32 @@ module Ferret
  def store_offsets?()
  return @store_offset
  end
+
  def store_positions?()
  return @store_position
  end
 
- def set!(indexed, store_term_vector, store_position, store_offset)
+ def omit_norms?()
+ return @omit_norms
+ end
+
+ def set!(indexed, store_term_vector, store_position,
+ store_offset, omit_norms)
  @indexed = indexed
  @store_term_vector = store_term_vector
  @store_position = store_position
  @store_offset = store_offset
+ @omit_norms = omit_norms
  end
 
  def initialize(name, indexed, number, store_term_vector,
  store_position = false,
- store_offset = false)
+ store_offset = false,
+ omit_norms = false)
  @name = name
  @number = number
- set!(indexed, store_term_vector, store_position, store_offset)
+ set!(indexed, store_term_vector, store_position,
+ store_offset, omit_norms)
  end
  end
  end
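
FieldInfos now records the new flag in its serialized bit mask alongside the existing ones. Below is a small stand-alone illustration of the packing and unpacking logic, using only the constants and bit operations shown above; the variable names and example values are illustrative.

    # Field-info flag byte as written to / read from the index.
    STORE_TERM_VECTOR = 0x2
    STORE_POSITION    = 0x4
    STORE_OFFSET      = 0x8
    OMIT_NORMS        = 0x10   # new in 0.3.0

    bits = 0
    bits |= STORE_TERM_VECTOR
    bits |= OMIT_NORMS               # field is indexed but keeps no norms

    (bits & OMIT_NORMS) != 0         # => true, recovered on load via add_internal
    (bits & STORE_POSITION) != 0     # => false
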
@@ -55,10 +55,16 @@ module Ferret::Index
  end
  else
  store = Field::Store::YES
- if fi.indexed? and tokenize
- index = Field::Index::TOKENIZED
- elsif fi.indexed? and not tokenize
- index = Field::Index::UNTOKENIZED
+ if fi.indexed?
+ if tokenize
+ index = Field::Index::TOKENIZED
+ else
+ if fi.omit_norms?
+ index = Field::Index::NO_NORMS
+ else
+ index = Field::Index::UNTOKENIZED
+ end
+ end
  else
  index = Field::Index::NO
  end
@@ -34,7 +34,8 @@ module Ferret::Index
  # used when you add a simple string to the index
  # using #add_document. This will also be used for
  # default_search_field unless you set it
- # explicitly.
+ # explicitly. The default for this value is the
+ # empty string "".
  # default_search_field:: This specifies the field or fields that will be
  # searched by the query parser. You can use a
  # string to specify one field, eg, "title". Or you
@@ -93,13 +94,20 @@ module Ferret::Index
  # your query. This defualts to true. If you set it
  # to false a QueryParseException is raised on a
  # query parse error.
+ # auto_flush:: Set this option to true if you want the index
+ # automatically flushed every time you do a write
+ # (includes delete) to the index. This is useful if
+ # you have multiple processes accessing the index
+ # and you don't want lock errors. This is set to
+ # false by default.
  #
  # Some examples;
  #
  # index = Index::Index.new(:analyzer => WhiteSpaceAnalyzer.new())
  #
  # index = Index::Index.new(:path => '/path/to/index',
- # :create_if_missing => false)
+ # :create_if_missing => false,
+ # :auto_flush => true)
  #
  # index = Index::Index.new(:dir => directory,
  # :close_dir => false
@@ -126,12 +134,15 @@ module Ferret::Index
 
  @dir.synchronize do
  @options = options
- @writer = IndexWriter.new(@dir, options)
+ @writer = IndexWriter.new(@dir, options) # create the index if need be
  options[:analyzer] = @analyzer = @writer.analyzer
+ @writer.close
+ @writer = nil
  @has_writes = false
  @reader = nil
  @options.delete(:create) # only want to create the first time if at all
  @close_dir = @options.delete(:close_dir) || false # we'll hold this here
+ @auto_flush = @options[:auto_flush] || false
  @default_search_field = (@options[:default_search_field] || \
  @options[:default_field] || "*")
  @default_field = @options[:default_field] || ""
@@ -257,6 +268,7 @@ module Ferret::Index
  ensure_writer_open()
  @has_writes = true
  @writer.add_document(fdoc, analyzer || @writer.analyzer)
+ flush() if @auto_flush
  end
  end
  alias :<< :add_document
@@ -334,6 +346,7 @@ module Ferret::Index
  else
  raise ArgumentError, "Cannot delete for id of type #{id.class}"
  end
+ flush() if @auto_flush
  end
  end
 
@@ -349,6 +362,7 @@ module Ferret::Index
  @searcher.search_each(query) do |doc, score|
  @reader.delete(doc)
  end
+ flush() if @auto_flush
  end
  end
 
@@ -393,6 +407,7 @@ module Ferret::Index
  else
  raise ArgumentError, "Cannot update for id of type #{id.class}"
  end
+ flush() if @auto_flush
  end
  end
 
@@ -429,6 +444,7 @@ module Ferret::Index
  docs_to_add.each do |document|
  @writer.add_document(document)
  end
+ flush() if @auto_flush
  end
  end
 
@@ -532,7 +548,7 @@ module Ferret::Index
  # false.
  def persist(directory, create = true)
  synchronize do
- flush
+ flush()
  old_dir = @dir
  if directory.is_a?(String)
  @dir = FSDirectory.new(directory, create)
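
The repeated `flush() if @auto_flush` calls above wire the new :auto_flush option into every write path (add_document, the delete and update variants, and bulk adds). A short usage sketch follows, assuming only the option names and the << alias shown in this diff; the path value is a placeholder.

    require 'ferret'

    # Sketch only: with :auto_flush => true each write is followed by flush(),
    # so another process opening the same index sees the change immediately.
    index = Ferret::Index::Index.new(:path => '/path/to/index',
                                     :create_if_missing => true,
                                     :auto_flush => true)

    index << "a simple string document"   # add_document plus automatic flush()
    index.close
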
@@ -143,7 +143,7 @@ module Ferret::Index
  # stored for the specified document.
  # raises:: IOError if index cannot be accessed
  #
- # See Field.TermVector
+ # See Field::TermVector
  def get_term_vectors(doc_number)
  raise NotImplementedError
  end
@@ -161,7 +161,7 @@ module Ferret::Index
  # returns:: term vector May be nil if field does not exist in the specified
  # document or term vector was not stored.
  # raises:: IOError if index cannot be accessed
- # See Field.TermVector
+ # See Field::TermVector
  def get_term_vector(doc_number, field)
  raise NotImplementedError
  end
@@ -223,12 +223,27 @@ module Ferret::Index
  def has_deletions?()
  raise NotImplementedError
  end
-
+
+ # Returns true if there are norms stored for this field.
+ def has_norms?(field)
+ # backward compatible implementation.
+ # SegmentReader has an efficient implementation.
+ return (get_norms(field) != nil)
+ end
+
  # Returns the byte-encoded normalization factor for the named field of
  # every document. This is used by the search code to score documents.
  #
  # See Field#boost
- def get_norms(field, bytes=nil, offset=nil)
+ def get_norms(field)
+ raise NotImplementedError
+ end
+
+ # Read norms into a pre-allocated array. This is used as an optimization
+ # of get_norms.
+ #
+ # See Field#boost
+ def get_norms_into(field, bytes, offset)
  raise NotImplementedError
  end
 
@@ -455,7 +455,7 @@ module Index
  write_deleteable_files(deletable) # note files we can't delete
  # This is a great time to start the garbage collector as all of our
  # ram files have just become free
- GC.start
+ #GC.start
 
  ##############################################################################
  # objs = {}
@@ -110,12 +110,20 @@ module Ferret::Index
  return hi
  end
 
+ def has_norms?(field)
+ @sub_readers.each {|reader| return true if reader.has_norms?(field)}
+ return false
+ end
+
+ def fake_norms()
+ return @ones ||= SegmentReader.create_fake_norms(max_doc())
+ end
+
  def get_norms(field)
  synchronize do
  bytes = @norms_cache[field]
- if (bytes != nil)
- return bytes # cache hit
- end
+ return bytes if bytes
+ return fake_norms if not has_norms?(field)
 
  bytes = " " * @max_doc
  @sub_readers.length.times do |i|
@@ -129,7 +137,9 @@ module Ferret::Index
  def get_norms_into(field, buf, offset)
  synchronize do
  bytes = @norms_cache[field]
- if (bytes != nil) # cache hit
+ bytes = fake_norms() if (bytes.nil? and not has_norms?(field))
+
+ if (bytes) # cache hit
  buf[offset ,@max_doc] = bytes[0, @max_doc]
  return
  end
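
Together with the IndexReader changes earlier in this diff (has_norms?, the narrowed get_norms, and the new get_norms_into), MultiReader now falls back to fake_norms when a field carries no norms, so callers always get a usable buffer. A hedged sketch of the resulting API: the `reader` variable and the "title" field are hypothetical; the method names and the one-byte-per-document buffer convention come from the hunks above.

    # Sketch only: `reader` stands for any concrete IndexReader
    # (e.g. a SegmentReader or MultiReader); "title" is a made-up field name.
    reader.has_norms?("title")              # false if the field was indexed with NO_NORMS

    norms = reader.get_norms("title")       # one norm byte per document
                                            # (MultiReader substitutes fake norms if absent)

    buf = " " * reader.max_doc              # pre-allocated buffer, as in the diff
    reader.get_norms_into("title", buf, 0)  # copy the norms into buf at offset 0
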
@@ -220,14 +230,14 @@ module Ferret::Index
  def next?()
  top = @queue.top()
  if (top == nil)
- @term = nil
+ @term_buffer = nil
  return false
  end
 
- @term = top.term
+ @term = top.term_buffer.term
  @doc_freq = 0
 
- while top and @term == top.term
+ while top and @term == top.term_buffer
  @queue.pop()
  @doc_freq += top.term_enum.doc_freq() # increment freq
  if (top.next?)
@@ -240,6 +250,10 @@ module Ferret::Index
  return true
  end
 
+ #def term()
+ # @term_buffer.term if @term_buffer
+ #end
+
  def close()
  @queue.close()
  end