ferret 0.2.2 → 0.3.0

Files changed (57)
  1. data/ext/Makefile +2 -2
  2. data/ext/ferret.c +27 -2
  3. data/ext/ferret.h +59 -16
  4. data/ext/ferret_ext.so +0 -0
  5. data/ext/index_io.c +72 -77
  6. data/ext/priority_queue.c +150 -145
  7. data/ext/ram_directory.c +47 -42
  8. data/ext/segment_merge_queue.c +4 -8
  9. data/ext/segment_term_enum.c +324 -0
  10. data/ext/similarity.c +59 -0
  11. data/ext/string_helper.c +2 -2
  12. data/ext/tags +150 -46
  13. data/ext/term.c +107 -152
  14. data/ext/term_buffer.c +105 -174
  15. data/ext/term_infos_reader.c +54 -0
  16. data/ext/terminfo.c +160 -0
  17. data/ext/token.c +93 -0
  18. data/lib/ferret.rb +1 -1
  19. data/lib/ferret/analysis/analyzers.rb +18 -0
  20. data/lib/ferret/analysis/standard_tokenizer.rb +19 -14
  21. data/lib/ferret/analysis/token.rb +8 -1
  22. data/lib/ferret/analysis/tokenizers.rb +10 -5
  23. data/lib/ferret/document/field.rb +33 -11
  24. data/lib/ferret/index/document_writer.rb +3 -2
  25. data/lib/ferret/index/field_infos.rb +38 -12
  26. data/lib/ferret/index/fields_io.rb +10 -4
  27. data/lib/ferret/index/index.rb +20 -4
  28. data/lib/ferret/index/index_reader.rb +19 -4
  29. data/lib/ferret/index/index_writer.rb +1 -1
  30. data/lib/ferret/index/multi_reader.rb +21 -7
  31. data/lib/ferret/index/segment_merge_info.rb +24 -22
  32. data/lib/ferret/index/segment_merge_queue.rb +2 -2
  33. data/lib/ferret/index/segment_merger.rb +28 -11
  34. data/lib/ferret/index/segment_reader.rb +19 -4
  35. data/lib/ferret/index/segment_term_enum.rb +3 -11
  36. data/lib/ferret/index/term_buffer.rb +13 -16
  37. data/lib/ferret/index/term_doc_enum.rb +8 -5
  38. data/lib/ferret/index/term_enum.rb +2 -2
  39. data/lib/ferret/index/term_info.rb +1 -5
  40. data/lib/ferret/index/term_infos_io.rb +2 -0
  41. data/lib/ferret/query_parser/query_parser.tab.rb +7 -7
  42. data/lib/ferret/search/phrase_scorer.rb +0 -1
  43. data/lib/ferret/search/similarity.rb +2 -2
  44. data/lib/ferret/search/term_scorer.rb +2 -2
  45. data/lib/ferret/store/directory.rb +2 -0
  46. data/lib/ferret/store/fs_store.rb +16 -3
  47. data/lib/ferret/store/ram_store.rb +2 -2
  48. data/test/unit/document/tc_field.rb +9 -0
  49. data/test/unit/index/tc_field_infos.rb +29 -21
  50. data/test/unit/index/tc_index.rb +44 -7
  51. data/test/unit/index/tc_term_buffer.rb +3 -3
  52. data/test/unit/index/tc_term_info.rb +1 -1
  53. data/test/unit/query_parser/tc_query_parser.rb +1 -1
  54. data/test/unit/search/tc_index_searcher.rb +3 -0
  55. data/test/unit/store/tc_fs_store.rb +47 -16
  56. data/test/unit/store/tc_ram_store.rb +1 -1
  57. metadata +8 -3
data/lib/ferret/index/segment_merge_info.rb
@@ -1,44 +1,46 @@
  module Ferret
  module Index
  class SegmentMergeInfo
- attr_reader :term, :term_enum, :reader, :postings, :doc_map, :base
+ attr_reader :term_enum, :reader, :base, :term_buffer

  def initialize(base, term_enum, reader)
  @base = base
  @reader = reader
  @term_enum = term_enum
- @term = term_enum.term()
- @postings = @reader.term_positions()
+ @term_buffer = term_enum.term_buffer
+ end
+
+ def positions
+ @postings ||= @reader.term_positions()
+ end

- # build array which maps document numbers around deletions
- if (@reader.has_deletions?())
- max_doc = @reader.max_doc()
- @doc_map = Array.new(max_doc)
- j = 0
- max_doc.times do |i|
- if (@reader.deleted?(i))
- @doc_map[i] = -1
- else
- @doc_map[i] = j
- j += 1
+ def doc_map
+ if @doc_map.nil?
+ # build array which maps document numbers around deletions
+ if (@reader.has_deletions?())
+ max_doc = @reader.max_doc()
+ @doc_map = Array.new(max_doc)
+ j = 0
+ max_doc.times do |i|
+ if (@reader.deleted?(i))
+ @doc_map[i] = -1
+ else
+ @doc_map[i] = j
+ j += 1
+ end
  end
  end
  end
+ return @doc_map
  end

  def next?
- if @term_enum.next?
- @term = @term_enum.term
- return true
- else
- @term = nil
- return false
- end
+ @term_enum.next?
  end

  def close()
  @term_enum.close()
- @postings.close()
+ @postings.close() if @postings
  @reader = nil
  end
  end
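The hunk above makes SegmentMergeInfo lazy: `positions` memoizes `@reader.term_positions()` with `||=`, and the deletion doc-map is only built on first use, so merges that never touch a segment's postings skip that work. A minimal, self-contained sketch of the doc-map construction (FakeReader and doc_map_for are illustrative stand-ins, not Ferret API):

    FakeReader = Struct.new(:deleted_docs, :max_doc) do
      def has_deletions?
        !deleted_docs.empty?
      end

      def deleted?(i)
        deleted_docs.include?(i)
      end
    end

    # Maps original doc numbers to post-merge numbers; deleted docs map to -1.
    def doc_map_for(reader)
      return nil unless reader.has_deletions?
      j = 0
      Array.new(reader.max_doc) { |i| reader.deleted?(i) ? -1 : (j += 1) - 1 }
    end

    p doc_map_for(FakeReader.new([1], 4))  # => [0, -1, 1, 2]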
data/lib/ferret/index/segment_merge_queue.rb
@@ -1,10 +1,10 @@
  module Ferret::Index
  class SegmentMergeQueue < Ferret::Utils::PriorityQueue
  def less_than(sti_a, sti_b)
- if sti_a.term == sti_b.term
+ if sti_a.term_buffer == sti_b.term_buffer
  return sti_a.base < sti_b.base
  else
- return sti_a.term < sti_b.term
+ return sti_a.term_buffer < sti_b.term_buffer
  end
  end

data/lib/ferret/index/segment_merger.rb
@@ -71,7 +71,7 @@ module Ferret::Index

  # Field norm files
  @field_infos.each_with_index do |fi, i|
- if (fi.indexed?)
+ if (fi.indexed? and not fi.omit_norms?)
  files << "#{@segment}.f#{i}"
  end
  end
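The `fi.indexed? and not fi.omit_norms?` guard recurs in several hunks below (segment_merger.rb and segment_reader.rb): a norms file `<segment>.f<N>` is only expected for fields that are indexed and do not omit norms. A self-contained sketch of that predicate, with FieldStub standing in for FieldInfo (illustrative only):

    FieldStub = Struct.new(:indexed, :omit_norms) do
      def indexed?
        indexed
      end

      def omit_norms?
        omit_norms
      end
    end

    # Norms are only written/read for indexed fields that keep norms.
    def wants_norms?(fi)
      fi.indexed? and not fi.omit_norms?
    end

    p wants_norms?(FieldStub.new(true, false))   # => true
    p wants_norms?(FieldStub.new(true, true))    # => false
    p wants_norms?(FieldStub.new(false, false))  # => false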
@@ -94,6 +94,21 @@ module Ferret::Index
  return files
  end

+ def add_indexed(reader, field_infos, field_names,
+ store_term_vectors,
+ store_position_with_term_vector,
+ store_offset_with_term_vector)
+ field_names.each do |field|
+ field_infos.add(field, true,
+ store_term_vectors,
+ store_position_with_term_vector,
+ store_offset_with_term_vector,
+ !reader.has_norms?(field))
+ end
+ end
+ private :add_indexed
+
+
  #
  # returns:: The number of documents in all of the readers
  # raises:: IOError
@@ -101,11 +116,11 @@ module Ferret::Index
  @field_infos = FieldInfos.new() # merge field names
  doc_count = 0
  @readers.each do |reader|
- @field_infos.add_fields(reader.get_field_names(IndexReader::FieldOption::TERM_VECTOR_WITH_POSITION_OFFSET), true, true, true, true)
- @field_infos.add_fields(reader.get_field_names(IndexReader::FieldOption::TERM_VECTOR_WITH_POSITION), true, true, true, false)
- @field_infos.add_fields(reader.get_field_names(IndexReader::FieldOption::TERM_VECTOR_WITH_OFFSET), true, true, false, true)
- @field_infos.add_fields(reader.get_field_names(IndexReader::FieldOption::TERM_VECTOR), true, true, false, false)
- @field_infos.add_fields(reader.get_field_names(IndexReader::FieldOption::INDEXED), true, false, false, false)
+ add_indexed(reader, @field_infos, reader.get_field_names(IndexReader::FieldOption::TERM_VECTOR_WITH_POSITION_OFFSET), true, true, true)
+ add_indexed(reader, @field_infos, reader.get_field_names(IndexReader::FieldOption::TERM_VECTOR_WITH_POSITION), true, true, false)
+ add_indexed(reader, @field_infos, reader.get_field_names(IndexReader::FieldOption::TERM_VECTOR_WITH_OFFSET), true, false, true)
+ add_indexed(reader, @field_infos, reader.get_field_names(IndexReader::FieldOption::TERM_VECTOR), true, false, false)
+ add_indexed(reader, @field_infos, reader.get_field_names(IndexReader::FieldOption::INDEXED), false, false, false)
  @field_infos.add_fields(reader.get_field_names(IndexReader::FieldOption::UNINDEXED), false)
  end
  @field_infos.write_to_dir(@directory, @segment + ".fnm")
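For reference, the flag triple the new add_indexed helper receives (store_term_vectors, store_position_with_term_vector, store_offset_with_term_vector) per field option, read directly off the hunk above; the hash below is only an illustration, not Ferret API. The sixth argument to field_infos.add, `!reader.has_norms?(field)`, is what carries omit_norms across a merge.

    ADD_INDEXED_FLAGS = {
      "TERM_VECTOR_WITH_POSITION_OFFSET" => [true,  true,  true],
      "TERM_VECTOR_WITH_POSITION"        => [true,  true,  false],
      "TERM_VECTOR_WITH_OFFSET"          => [true,  false, true],
      "TERM_VECTOR"                      => [true,  false, false],
      "INDEXED"                          => [false, false, false],
    }
    ADD_INDEXED_FLAGS.each { |opt, flags| puts "#{opt}: #{flags.inspect}" }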
@@ -186,10 +201,12 @@ module Ferret::Index
  match_size = 0 # pop matching terms
  match[match_size] = @queue.pop
  match_size += 1
- term = match[0].term
+ #term = match[0].term
+ term_buffer = match[0].term_buffer
  top = @queue.top

- while top and term == top.term
+ #while top and term == top.term
+ while top and term_buffer == top.term_buffer
  match[match_size] = @queue.pop
  match_size += 1
  top = @queue.top
@@ -227,7 +244,7 @@ module Ferret::Index
  if (df > 0)
  # add an enbegin to the dictionary with pointers to prox and freq files
  @term_info.set_values!(df, freq_pointer, prox_pointer, (skip_pointer - freq_pointer))
- @term_infos_writer.add(smis[0].term, @term_info)
+ @term_infos_writer.add(smis[0].term_buffer.term, @term_info)
  end
  end

@@ -244,7 +261,7 @@ module Ferret::Index
  reset_skip()
  n.times do |i|
  smi = smis[i]
- postings = smi.postings
+ postings = smi.positions
  base = smi.base
  doc_map = smi.doc_map

@@ -315,7 +332,7 @@ module Ferret::Index

  def merge_norms()
  @field_infos.each_with_index do |fi, i|
- if (fi.indexed?)
+ if (fi.indexed? and not fi.omit_norms?)
  output = @directory.create_output(@segment + ".f" + i.to_s)
  begin
  @readers.each do |reader|
data/lib/ferret/index/segment_reader.rb
@@ -127,7 +127,7 @@ module Ferret::Index
  end

  @field_infos.each_with_index do |fi, i|
- if (fi.indexed?)
+ if (fi.indexed? and not fi.omit_norms?)
  if @cfs_reader.nil?
  name = "#{@segment}.f#{i}"
  else
@@ -228,10 +228,22 @@ module Ferret::Index
  return field_set
  end

+ def has_norms?(field)
+ return @norms.has_key?(field)
+ end
+
+ def SegmentReader.create_fake_norms(size)
+ Array.new(size, 1).pack("C*")
+ end
+
+ def fake_norms()
+ return @ones ||= SegmentReader.create_fake_norms(max_doc())
+ end
+
  def get_norms(field)
  synchronize do
  norm = @norms[field]
- if (norm == nil) # not an indexed field
+ if (norm == nil) # not an indexed field or omit norms
  return nil
  end
  if (norm.bytes == nil) # value not yet read
@@ -258,7 +270,10 @@ module Ferret::Index
  def get_norms_into(field, bytes, offset)
  synchronize do
  norm = @norms[field]
- return if (norm == nil) # use zeros in array
+ if (norm.nil?)
+ bytes[offset, max_doc()] = fake_norms[0, max_doc()]
+ return
+ end

  if (norm.bytes != nil) # can copy from cache
  bytes[offset, max_doc()] = norm.bytes[0, max_doc()]
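The fake norms added above are simply max_doc bytes of 1 packed into a binary string, and get_norms_into now copies that slice into the destination range for a field without stored norms instead of leaving zeros. A self-contained sketch of the byte handling:

    # create_fake_norms(size) from the hunk: a string of `size` 0x01 bytes.
    fake = Array.new(4, 1).pack("C*")
    p fake.unpack("C*")  # => [1, 1, 1, 1]

    # get_norms_into for a field with no stored norms: overwrite the target range.
    bytes   = "\x00" * 8
    offset  = 2
    max_doc = 4
    bytes[offset, max_doc] = fake[0, max_doc]
    p bytes.unpack("C*")  # => [0, 0, 1, 1, 1, 1, 0, 0]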
@@ -277,7 +292,7 @@ module Ferret::Index

  def open_norms(cfs_dir)
  @field_infos.each do |fi|
- if (fi.indexed?)
+ if (fi.indexed? and not fi.omit_norms?)
  # look first if there are separate norms in compound format
  file_name = @segment + ".s" + fi.number.to_s
  d = @directory
data/lib/ferret/index/segment_term_enum.rb
@@ -15,11 +15,9 @@ module Ferret::Index

  @term_buffer = TermBuffer.new()
  @prev_buffer = TermBuffer.new()
- @scratch = nil # used for scanning
  @term_info = TermInfo.new()

  @index_pointer = 0
- @format_m1skip_interval = nil

  first_int = @input.read_int()

@@ -61,8 +59,7 @@ module Ferret::Index

  #attr_accessors for the clone method
  attr_accessor :input, :term_buffer, :prev_buffer
- protected :input, :input=, :term_buffer,
- :term_buffer=, :prev_buffer, :prev_buffer=
+ protected :input, :input=, :prev_buffer, :prev_buffer=

  def initialize_copy(o)
  super
@@ -83,7 +80,7 @@ module Ferret::Index
  # Increments the enumeration to the next element. True if one exists.
  def next?
  @position += 1
- if (@position > @size - 1)
+ if (@position >= @size)
  @term_buffer.reset()
  return false
  end
@@ -117,13 +114,8 @@ module Ferret::Index
  return true
  end

- # Optimized scan, without allocating new terms.
  def scan_to(term)
- if (@scratch == nil)
- @scratch = TermBuffer.new()
- end
- @scratch.term = term
- while (@scratch > @term_buffer and next?) do
+ while (term > @term_buffer and next?) do
  end
  end

data/lib/ferret/index/term_buffer.rb
@@ -2,21 +2,21 @@ module Ferret::Index
  class TermBuffer
  include Comparable

- attr_reader :text, :text_length, :field
+ attr_reader :text_buf, :text_length, :field

  def initialize
- @text = String.new
+ @text_buf = String.new
  @text_length = -1
  @field = nil
  end

  def hash()
- return @text.hash + @field.hash
+ return text.hash + @field.hash
  end

  def <=>(other)
  if (@field == other.field)
- return text_str <=> other.text_str
+ return text <=> other.text
  end
  @field <=> other.field
  end
@@ -27,7 +27,7 @@ module Ferret::Index
  length = input.read_vint()
  total_length = start + length
  @text_length = total_length
- input.read_chars(@text, start, length)
+ input.read_chars(@text_buf, start, length)
  @field = field_infos[input.read_vint()].name
  end

@@ -38,8 +38,8 @@ module Ferret::Index
  end

  # copy text into the buffer
- @text_length = term.text.length
- @text = term.text.clone
+ @text_buf = term.text.clone
+ @text_length = @text_buf.length

  @field = term.field
  @term = term
@@ -47,14 +47,15 @@ module Ferret::Index

  def set!(other)
  @text_length = other.text_length
- @text = other.text.clone if other.text
+ @text_buf = other.text_buf.clone if other.text_buf
  @field = other.field
  @term = other.term
  end
+ alias :initialize_copy :set!

  def reset()
  @field = nil
- @text = String.new
+ @text_buf = ""
  @text_length = 0
  @term = nil
  end
@@ -65,18 +66,14 @@ module Ferret::Index
  end

  if @term.nil?
- @term = Term.new(@field, @text[0,@text_length].to_s)
+ @term = Term.new(@field, @text_buf[0,@text_length].to_s)
  end
  return @term
  end
  alias :term :to_term

- def initialize_copy(o)
- set!(o)
- end
-
- def text_str()
- @text[0,@text_length]
+ def text()
+ @text_buf[0,@text_length]
  end
  end
  def to_s()
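With the rename, callers read `text` (the first text_length characters of text_buf) instead of the raw buffer, and buffers still order by field first, then text. A self-contained sketch of that ordering (Buf is an illustrative stand-in, not the real TermBuffer):

    Buf = Struct.new(:field, :text_buf, :text_length) do
      include Comparable

      def text
        text_buf[0, text_length]
      end

      def <=>(other)
        field == other.field ? text <=> other.text : field <=> other.field
      end
    end

    a = Buf.new("body", "apple pie", 5)  # visible text: "apple"
    b = Buf.new("body", "apricot", 7)    # visible text: "apricot"
    p a < b   # => true  ("apple" sorts before "apricot")
    p a.text  # => "apple"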
data/lib/ferret/index/term_doc_enum.rb
@@ -73,13 +73,16 @@ module Ferret::Index
  if t.instance_of?(Term)
  ti = parent.term_infos[t]
  elsif t.is_a?(TermEnum)
+ ti = t.term_info()
+ # The following is being done in the Java version. I don't think it's
+ # necessary.
  # use comparison of fieldinfos to verify that term enum (t) belongs to the
  # same segment as this SegmentTermDocEnum
- if (t.instance_of?(SegmentTermEnum) and t.field_infos == parent.field_infos)
- ti = t.term_info()
- else # punt case
- ti = parent.term_infos[t.term]
- end
+ #if (t.instance_of?(SegmentTermEnum) and t.field_infos == parent.field_infos)
+ # ti = t.term_info()
+ #else # punt case
+ # ti = parent.term_infos[t.term]
+ #end
  elsif t.is_a? TermInfo # this one is easy. That's exactly what we're looking for
  ti = t
  else
data/lib/ferret/index/term_enum.rb
@@ -33,7 +33,7 @@ module Ferret
  #
  # Behaves as if written:
  #
- # def skip_to(target_term)
+ # def skip_to(target)
  # while (target > term)
  # if (!next()) return false
  # end
@@ -41,7 +41,7 @@ module Ferret
  # end
  #
  # Some implementations are considerably more efficient than that.
- def skip_to(term)
+ def skip_to(target)
  while (target > term)
  return false if not next?
  end
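The second hunk is a real fix: with the old signature `def skip_to(term)`, the loop referenced an undefined `target`. A self-contained sketch of the default skip_to over a sorted enumeration (StubEnum is illustrative, not Ferret's TermEnum):

    class StubEnum
      def initialize(terms)
        @terms = terms
        @pos = -1
      end

      def next?
        (@pos += 1) < @terms.length
      end

      def term
        @terms[@pos]
      end

      # Default skip_to from the hunk: advance until term >= target.
      def skip_to(target)
        while target > term
          return false if not next?
        end
        true
      end
    end

    e = StubEnum.new(%w[ant bee cow])
    e.next?                     # position on "ant"
    p e.skip_to("bud"), e.term  # => true, then "cow"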
data/lib/ferret/index/term_info.rb
@@ -21,10 +21,6 @@ module Ferret::Index
  @skip_offset = so
  end

- def copy_of()
- TermInfo.new(doc_freq, freq_pointer, prox_pointer, skip_offset)
- end
-
  def ==(o)
  return false if !o.instance_of?(TermInfo)
  @doc_freq == o.doc_freq &&
@@ -35,7 +31,7 @@ module Ferret::Index
  alias eql? ==

  def to_s()
- "TermInfo:df=#{@doc_freq}:fp=#{@freq_pointer}:pp=#{@prox_pointer}:so=#{@skip_offset}"
+ "TermInfo:df=#{doc_freq}:fp=#{freq_pointer}:pp=#{prox_pointer}:so=#{skip_offset}"
  end
  end
  end
data/lib/ferret/index/term_infos_io.rb
@@ -252,6 +252,8 @@ module Ferret::Index

  # Returns the offset of the greatest index entry which is less than or
  # equal to term.
+ #
+ # This method is rewritten in the C extension.
  def get_index_offset(term)
  lo = 0 # binary search @index_terms[]
  hi = @index_terms.length - 1
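get_index_offset binary-searches for the greatest index entry that is less than or equal to the requested term (the new comment notes the C extension reimplements the same routine). A self-contained sketch of that search over a sorted array:

    # Returns the index of the greatest element of sorted `entries` <= key,
    # or -1 if every entry is greater than key.
    def greatest_le(entries, key)
      lo, hi = 0, entries.length - 1
      while lo <= hi
        mid = (lo + hi) / 2
        cmp = entries[mid] <=> key
        return mid if cmp == 0
        if cmp < 0
          lo = mid + 1
        else
          hi = mid - 1
        end
      end
      hi
    end

    p greatest_le(%w[ant bee cow], "bug")  # => 1 ("bee")
    p greatest_le(%w[ant bee cow], "aah")  # => -1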
data/lib/ferret/query_parser/query_parser.tab.rb
@@ -11,7 +11,7 @@ module Ferret

  class QueryParser < Racc::Parser

- module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id81dbd43492', 'lib/ferret/query_parser/query_parser.y', 126
+ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id6e7f6ac20b', 'lib/ferret/query_parser/query_parser.y', 126
  attr_accessor :default_field, :fields, :handle_parse_errors

  def initialize(default_field = "*", options = {})
@@ -53,11 +53,11 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id81dbd4349
  case str
  when /\A\s+/
  ;
- when /\A[#{ECHR}]/
+ when /\A([#{EWCHR}]|[*?](?=:))/
  @q.push [ RESERVED[$&]||$&, $& ]
  when /\A(\&\&|\|\|)/
  @q.push [ RESERVED[$&], $& ]
- when /\A(\\[#{ECHR}]|[^\s#{ECHR}])+[?*](\\[#{EWCHR}]|[^\s#{EWCHR}])*/
+ when /\A(\\[#{ECHR}]|[^\s#{ECHR}])*[?*](\\[#{EWCHR}]|[^\s#{EWCHR}])*/
  str = $'
  unescaped = $&.gsub(/\\(?!\\)/,"")
  @q.push [ :WILD_STRING, unescaped ]
@@ -82,8 +82,8 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id81dbd4349
  end

  @q.push([ false, '$' ])
- #p @q

+ query = nil
  begin
  query = do_parse
  rescue Racc::ParseError => e
@@ -199,7 +199,7 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id81dbd4349
  tokens << token
  end
  if tokens.length == 0
- return nil
+ return TermQuery.new(Term.new(field, ""))
  elsif tokens.length == 1
  return TermQuery.new(Term.new(field, tokens[0].term_text))
  else
@@ -221,7 +221,7 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id81dbd4349
  return FuzzyQuery.new(Term.new(field, token.term_text))
  end
  else
- return nil
+ return TermQuery.new(Term.new(field, ""))
  end
  end

@@ -402,7 +402,7 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id81dbd4349
  return qp.parse(query)
  end

- ..end lib/ferret/query_parser/query_parser.y modeval..id81dbd43492
+ ..end lib/ferret/query_parser/query_parser.y modeval..id6e7f6ac20b

  ##### racc 1.4.4 generates ###