ferret 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. data/ext/Makefile +2 -2
  2. data/ext/ferret.c +27 -2
  3. data/ext/ferret.h +59 -16
  4. data/ext/ferret_ext.so +0 -0
  5. data/ext/index_io.c +72 -77
  6. data/ext/priority_queue.c +150 -145
  7. data/ext/ram_directory.c +47 -42
  8. data/ext/segment_merge_queue.c +4 -8
  9. data/ext/segment_term_enum.c +324 -0
  10. data/ext/similarity.c +59 -0
  11. data/ext/string_helper.c +2 -2
  12. data/ext/tags +150 -46
  13. data/ext/term.c +107 -152
  14. data/ext/term_buffer.c +105 -174
  15. data/ext/term_infos_reader.c +54 -0
  16. data/ext/terminfo.c +160 -0
  17. data/ext/token.c +93 -0
  18. data/lib/ferret.rb +1 -1
  19. data/lib/ferret/analysis/analyzers.rb +18 -0
  20. data/lib/ferret/analysis/standard_tokenizer.rb +19 -14
  21. data/lib/ferret/analysis/token.rb +8 -1
  22. data/lib/ferret/analysis/tokenizers.rb +10 -5
  23. data/lib/ferret/document/field.rb +33 -11
  24. data/lib/ferret/index/document_writer.rb +3 -2
  25. data/lib/ferret/index/field_infos.rb +38 -12
  26. data/lib/ferret/index/fields_io.rb +10 -4
  27. data/lib/ferret/index/index.rb +20 -4
  28. data/lib/ferret/index/index_reader.rb +19 -4
  29. data/lib/ferret/index/index_writer.rb +1 -1
  30. data/lib/ferret/index/multi_reader.rb +21 -7
  31. data/lib/ferret/index/segment_merge_info.rb +24 -22
  32. data/lib/ferret/index/segment_merge_queue.rb +2 -2
  33. data/lib/ferret/index/segment_merger.rb +28 -11
  34. data/lib/ferret/index/segment_reader.rb +19 -4
  35. data/lib/ferret/index/segment_term_enum.rb +3 -11
  36. data/lib/ferret/index/term_buffer.rb +13 -16
  37. data/lib/ferret/index/term_doc_enum.rb +8 -5
  38. data/lib/ferret/index/term_enum.rb +2 -2
  39. data/lib/ferret/index/term_info.rb +1 -5
  40. data/lib/ferret/index/term_infos_io.rb +2 -0
  41. data/lib/ferret/query_parser/query_parser.tab.rb +7 -7
  42. data/lib/ferret/search/phrase_scorer.rb +0 -1
  43. data/lib/ferret/search/similarity.rb +2 -2
  44. data/lib/ferret/search/term_scorer.rb +2 -2
  45. data/lib/ferret/store/directory.rb +2 -0
  46. data/lib/ferret/store/fs_store.rb +16 -3
  47. data/lib/ferret/store/ram_store.rb +2 -2
  48. data/test/unit/document/tc_field.rb +9 -0
  49. data/test/unit/index/tc_field_infos.rb +29 -21
  50. data/test/unit/index/tc_index.rb +44 -7
  51. data/test/unit/index/tc_term_buffer.rb +3 -3
  52. data/test/unit/index/tc_term_info.rb +1 -1
  53. data/test/unit/query_parser/tc_query_parser.rb +1 -1
  54. data/test/unit/search/tc_index_searcher.rb +3 -0
  55. data/test/unit/store/tc_fs_store.rb +47 -16
  56. data/test/unit/store/tc_ram_store.rb +1 -1
  57. metadata +8 -3
@@ -1,44 +1,46 @@
1
1
  module Ferret
2
2
  module Index
3
3
  class SegmentMergeInfo
4
- attr_reader :term, :term_enum, :reader, :postings, :doc_map, :base
4
+ attr_reader :term_enum, :reader, :base, :term_buffer
5
5
 
6
6
  def initialize(base, term_enum, reader)
7
7
  @base = base
8
8
  @reader = reader
9
9
  @term_enum = term_enum
10
- @term = term_enum.term()
11
- @postings = @reader.term_positions()
10
+ @term_buffer = term_enum.term_buffer
11
+ end
12
+
13
+ def positions
14
+ @postings ||= @reader.term_positions()
15
+ end
12
16
 
13
- # build array which maps document numbers around deletions
14
- if (@reader.has_deletions?())
15
- max_doc = @reader.max_doc()
16
- @doc_map = Array.new(max_doc)
17
- j = 0
18
- max_doc.times do |i|
19
- if (@reader.deleted?(i))
20
- @doc_map[i] = -1
21
- else
22
- @doc_map[i] = j
23
- j += 1
17
+ def doc_map
18
+ if @doc_map.nil?
19
+ # build array which maps document numbers around deletions
20
+ if (@reader.has_deletions?())
21
+ max_doc = @reader.max_doc()
22
+ @doc_map = Array.new(max_doc)
23
+ j = 0
24
+ max_doc.times do |i|
25
+ if (@reader.deleted?(i))
26
+ @doc_map[i] = -1
27
+ else
28
+ @doc_map[i] = j
29
+ j += 1
30
+ end
24
31
  end
25
32
  end
26
33
  end
34
+ return @doc_map
27
35
  end
28
36
 
29
37
  def next?
30
- if @term_enum.next?
31
- @term = @term_enum.term
32
- return true
33
- else
34
- @term = nil
35
- return false
36
- end
38
+ @term_enum.next?
37
39
  end
38
40
 
39
41
  def close()
40
42
  @term_enum.close()
41
- @postings.close()
43
+ @postings.close() if @postings
42
44
  @reader = nil
43
45
  end
44
46
  end
@@ -1,10 +1,10 @@
1
1
  module Ferret::Index
2
2
  class SegmentMergeQueue < Ferret::Utils::PriorityQueue
3
3
  def less_than(sti_a, sti_b)
4
- if sti_a.term == sti_b.term
4
+ if sti_a.term_buffer == sti_b.term_buffer
5
5
  return sti_a.base < sti_b.base
6
6
  else
7
- return sti_a.term < sti_b.term
7
+ return sti_a.term_buffer < sti_b.term_buffer
8
8
  end
9
9
  end
10
10
 
@@ -71,7 +71,7 @@ module Ferret::Index
71
71
 
72
72
  # Field norm files
73
73
  @field_infos.each_with_index do |fi, i|
74
- if (fi.indexed?)
74
+ if (fi.indexed? and not fi.omit_norms?)
75
75
  files << "#{@segment}.f#{i}"
76
76
  end
77
77
  end
@@ -94,6 +94,21 @@ module Ferret::Index
94
94
  return files
95
95
  end
96
96
 
97
+ def add_indexed(reader, field_infos, field_names,
98
+ store_term_vectors,
99
+ store_position_with_term_vector,
100
+ store_offset_with_term_vector)
101
+ field_names.each do |field|
102
+ field_infos.add(field, true,
103
+ store_term_vectors,
104
+ store_position_with_term_vector,
105
+ store_offset_with_term_vector,
106
+ !reader.has_norms?(field))
107
+ end
108
+ end
109
+ private :add_indexed
110
+
111
+
97
112
  #
98
113
  # returns:: The number of documents in all of the readers
99
114
  # raises:: IOError
@@ -101,11 +116,11 @@ module Ferret::Index
101
116
  @field_infos = FieldInfos.new() # merge field names
102
117
  doc_count = 0
103
118
  @readers.each do |reader|
104
- @field_infos.add_fields(reader.get_field_names(IndexReader::FieldOption::TERM_VECTOR_WITH_POSITION_OFFSET), true, true, true, true)
105
- @field_infos.add_fields(reader.get_field_names(IndexReader::FieldOption::TERM_VECTOR_WITH_POSITION), true, true, true, false)
106
- @field_infos.add_fields(reader.get_field_names(IndexReader::FieldOption::TERM_VECTOR_WITH_OFFSET), true, true, false, true)
107
- @field_infos.add_fields(reader.get_field_names(IndexReader::FieldOption::TERM_VECTOR), true, true, false, false)
108
- @field_infos.add_fields(reader.get_field_names(IndexReader::FieldOption::INDEXED), true, false, false, false)
119
+ add_indexed(reader, @field_infos, reader.get_field_names(IndexReader::FieldOption::TERM_VECTOR_WITH_POSITION_OFFSET), true, true, true)
120
+ add_indexed(reader, @field_infos, reader.get_field_names(IndexReader::FieldOption::TERM_VECTOR_WITH_POSITION), true, true, false)
121
+ add_indexed(reader, @field_infos, reader.get_field_names(IndexReader::FieldOption::TERM_VECTOR_WITH_OFFSET), true, false, true)
122
+ add_indexed(reader, @field_infos, reader.get_field_names(IndexReader::FieldOption::TERM_VECTOR), true, false, false)
123
+ add_indexed(reader, @field_infos, reader.get_field_names(IndexReader::FieldOption::INDEXED), false, false, false)
109
124
  @field_infos.add_fields(reader.get_field_names(IndexReader::FieldOption::UNINDEXED), false)
110
125
  end
111
126
  @field_infos.write_to_dir(@directory, @segment + ".fnm")
@@ -186,10 +201,12 @@ module Ferret::Index
186
201
  match_size = 0 # pop matching terms
187
202
  match[match_size] = @queue.pop
188
203
  match_size += 1
189
- term = match[0].term
204
+ #term = match[0].term
205
+ term_buffer = match[0].term_buffer
190
206
  top = @queue.top
191
207
 
192
- while top and term == top.term
208
+ #while top and term == top.term
209
+ while top and term_buffer == top.term_buffer
193
210
  match[match_size] = @queue.pop
194
211
  match_size += 1
195
212
  top = @queue.top
@@ -227,7 +244,7 @@ module Ferret::Index
227
244
  if (df > 0)
228
245
  # add an entry to the dictionary with pointers to prox and freq files
229
246
  @term_info.set_values!(df, freq_pointer, prox_pointer, (skip_pointer - freq_pointer))
230
- @term_infos_writer.add(smis[0].term, @term_info)
247
+ @term_infos_writer.add(smis[0].term_buffer.term, @term_info)
231
248
  end
232
249
  end
233
250
 
@@ -244,7 +261,7 @@ module Ferret::Index
244
261
  reset_skip()
245
262
  n.times do |i|
246
263
  smi = smis[i]
247
- postings = smi.postings
264
+ postings = smi.positions
248
265
  base = smi.base
249
266
  doc_map = smi.doc_map
250
267
 
@@ -315,7 +332,7 @@ module Ferret::Index
315
332
 
316
333
  def merge_norms()
317
334
  @field_infos.each_with_index do |fi, i|
318
- if (fi.indexed?)
335
+ if (fi.indexed? and not fi.omit_norms?)
319
336
  output = @directory.create_output(@segment + ".f" + i.to_s)
320
337
  begin
321
338
  @readers.each do |reader|
@@ -127,7 +127,7 @@ module Ferret::Index
127
127
  end
128
128
 
129
129
  @field_infos.each_with_index do |fi, i|
130
- if (fi.indexed?)
130
+ if (fi.indexed? and not fi.omit_norms?)
131
131
  if @cfs_reader.nil?
132
132
  name = "#{@segment}.f#{i}"
133
133
  else
@@ -228,10 +228,22 @@ module Ferret::Index
228
228
  return field_set
229
229
  end
230
230
 
231
+ def has_norms?(field)
232
+ return @norms.has_key?(field)
233
+ end
234
+
235
+ def SegmentReader.create_fake_norms(size)
236
+ Array.new(size, 1).pack("C*")
237
+ end
238
+
239
+ def fake_norms()
240
+ return @ones ||= SegmentReader.create_fake_norms(max_doc())
241
+ end
242
+
231
243
  def get_norms(field)
232
244
  synchronize do
233
245
  norm = @norms[field]
234
- if (norm == nil) # not an indexed field
246
+ if (norm == nil) # not an indexed field or omit norms
235
247
  return nil
236
248
  end
237
249
  if (norm.bytes == nil) # value not yet read
@@ -258,7 +270,10 @@ module Ferret::Index
258
270
  def get_norms_into(field, bytes, offset)
259
271
  synchronize do
260
272
  norm = @norms[field]
261
- return if (norm == nil) # use zeros in array
273
+ if (norm.nil?)
274
+ bytes[offset, max_doc()] = fake_norms[0, max_doc()]
275
+ return
276
+ end
262
277
 
263
278
  if (norm.bytes != nil) # can copy from cache
264
279
  bytes[offset, max_doc()] = norm.bytes[0, max_doc()]
@@ -277,7 +292,7 @@ module Ferret::Index
277
292
 
278
293
  def open_norms(cfs_dir)
279
294
  @field_infos.each do |fi|
280
- if (fi.indexed?)
295
+ if (fi.indexed? and not fi.omit_norms?)
281
296
  # look first if there are separate norms in compound format
282
297
  file_name = @segment + ".s" + fi.number.to_s
283
298
  d = @directory
@@ -15,11 +15,9 @@ module Ferret::Index
15
15
 
16
16
  @term_buffer = TermBuffer.new()
17
17
  @prev_buffer = TermBuffer.new()
18
- @scratch = nil # used for scanning
19
18
  @term_info = TermInfo.new()
20
19
 
21
20
  @index_pointer = 0
22
- @format_m1skip_interval = nil
23
21
 
24
22
  first_int = @input.read_int()
25
23
 
@@ -61,8 +59,7 @@ module Ferret::Index
61
59
 
62
60
  #attr_accessors for the clone method
63
61
  attr_accessor :input, :term_buffer, :prev_buffer
64
- protected :input, :input=, :term_buffer,
65
- :term_buffer=, :prev_buffer, :prev_buffer=
62
+ protected :input, :input=, :prev_buffer, :prev_buffer=
66
63
 
67
64
  def initialize_copy(o)
68
65
  super
@@ -83,7 +80,7 @@ module Ferret::Index
83
80
  # Increments the enumeration to the next element. True if one exists.
84
81
  def next?
85
82
  @position += 1
86
- if (@position > @size - 1)
83
+ if (@position >= @size)
87
84
  @term_buffer.reset()
88
85
  return false
89
86
  end
@@ -117,13 +114,8 @@ module Ferret::Index
117
114
  return true
118
115
  end
119
116
 
120
- # Optimized scan, without allocating new terms.
121
117
  def scan_to(term)
122
- if (@scratch == nil)
123
- @scratch = TermBuffer.new()
124
- end
125
- @scratch.term = term
126
- while (@scratch > @term_buffer and next?) do
118
+ while (term > @term_buffer and next?) do
127
119
  end
128
120
  end
129
121
 
@@ -2,21 +2,21 @@ module Ferret::Index
2
2
  class TermBuffer
3
3
  include Comparable
4
4
 
5
- attr_reader :text, :text_length, :field
5
+ attr_reader :text_buf, :text_length, :field
6
6
 
7
7
  def initialize
8
- @text = String.new
8
+ @text_buf = String.new
9
9
  @text_length = -1
10
10
  @field = nil
11
11
  end
12
12
 
13
13
  def hash()
14
- return @text.hash + @field.hash
14
+ return text.hash + @field.hash
15
15
  end
16
16
 
17
17
  def <=>(other)
18
18
  if (@field == other.field)
19
- return text_str <=> other.text_str
19
+ return text <=> other.text
20
20
  end
21
21
  @field <=> other.field
22
22
  end
@@ -27,7 +27,7 @@ module Ferret::Index
27
27
  length = input.read_vint()
28
28
  total_length = start + length
29
29
  @text_length = total_length
30
- input.read_chars(@text, start, length)
30
+ input.read_chars(@text_buf, start, length)
31
31
  @field = field_infos[input.read_vint()].name
32
32
  end
33
33
 
@@ -38,8 +38,8 @@ module Ferret::Index
38
38
  end
39
39
 
40
40
  # copy text into the buffer
41
- @text_length = term.text.length
42
- @text = term.text.clone
41
+ @text_buf = term.text.clone
42
+ @text_length = @text_buf.length
43
43
 
44
44
  @field = term.field
45
45
  @term = term
@@ -47,14 +47,15 @@ module Ferret::Index
47
47
 
48
48
  def set!(other)
49
49
  @text_length = other.text_length
50
- @text = other.text.clone if other.text
50
+ @text_buf = other.text_buf.clone if other.text_buf
51
51
  @field = other.field
52
52
  @term = other.term
53
53
  end
54
+ alias :initialize_copy :set!
54
55
 
55
56
  def reset()
56
57
  @field = nil
57
- @text = String.new
58
+ @text_buf = ""
58
59
  @text_length = 0
59
60
  @term = nil
60
61
  end
@@ -65,18 +66,14 @@ module Ferret::Index
65
66
  end
66
67
 
67
68
  if @term.nil?
68
- @term = Term.new(@field, @text[0,@text_length].to_s)
69
+ @term = Term.new(@field, @text_buf[0,@text_length].to_s)
69
70
  end
70
71
  return @term
71
72
  end
72
73
  alias :term :to_term
73
74
 
74
- def initialize_copy(o)
75
- set!(o)
76
- end
77
-
78
- def text_str()
79
- @text[0,@text_length]
75
+ def text()
76
+ @text_buf[0,@text_length]
80
77
  end
81
78
 
82
79
  def to_s()
@@ -73,13 +73,16 @@ module Ferret::Index
73
73
  if t.instance_of?(Term)
74
74
  ti = parent.term_infos[t]
75
75
  elsif t.is_a?(TermEnum)
76
+ ti = t.term_info()
77
+ # The following is being done in the Java version. I don't think it's
78
+ # necessary.
76
79
  # use comparison of fieldinfos to verify that term enum (t) belongs to the
77
80
  # same segment as this SegmentTermDocEnum
78
- if (t.instance_of?(SegmentTermEnum) and t.field_infos == parent.field_infos)
79
- ti = t.term_info()
80
- else # punt case
81
- ti = parent.term_infos[t.term]
82
- end
81
+ #if (t.instance_of?(SegmentTermEnum) and t.field_infos == parent.field_infos)
82
+ # ti = t.term_info()
83
+ #else # punt case
84
+ # ti = parent.term_infos[t.term]
85
+ #end
83
86
  elsif t.is_a? TermInfo # this one is easy. That's exactly what we're looking for
84
87
  ti = t
85
88
  else
@@ -33,7 +33,7 @@ module Ferret
33
33
  #
34
34
  # Behaves as if written:
35
35
  #
36
- # def skip_to(target_term)
36
+ # def skip_to(target)
37
37
  # while (target > term)
38
38
  # if (!next()) return false
39
39
  # end
@@ -41,7 +41,7 @@ module Ferret
41
41
  # end
42
42
  #
43
43
  # Some implementations are considerably more efficient than that.
44
- def skip_to(term)
44
+ def skip_to(target)
45
45
  while (target > term)
46
46
  return false if not next?
47
47
  end
@@ -21,10 +21,6 @@ module Ferret::Index
21
21
  @skip_offset = so
22
22
  end
23
23
 
24
- def copy_of()
25
- TermInfo.new(doc_freq, freq_pointer, prox_pointer, skip_offset)
26
- end
27
-
28
24
  def ==(o)
29
25
  return false if !o.instance_of?(TermInfo)
30
26
  @doc_freq == o.doc_freq &&
@@ -35,7 +31,7 @@ module Ferret::Index
35
31
  alias eql? ==
36
32
 
37
33
  def to_s()
38
- "TermInfo:df=#{@doc_freq}:fp=#{@freq_pointer}:pp=#{@prox_pointer}:so=#{@skip_offset}"
34
+ "TermInfo:df=#{doc_freq}:fp=#{freq_pointer}:pp=#{prox_pointer}:so=#{skip_offset}"
39
35
  end
40
36
  end
41
37
  end
@@ -252,6 +252,8 @@ module Ferret::Index
252
252
 
253
253
  # Returns the offset of the greatest index entry which is less than or
254
254
  # equal to term.
255
+ #
256
+ # This method is rewritten in the C extension.
255
257
  def get_index_offset(term)
256
258
  lo = 0 # binary search @index_terms[]
257
259
  hi = @index_terms.length - 1
@@ -11,7 +11,7 @@ module Ferret
11
11
 
12
12
  class QueryParser < Racc::Parser
13
13
 
14
- module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id81dbd43492', 'lib/ferret/query_parser/query_parser.y', 126
14
+ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id6e7f6ac20b', 'lib/ferret/query_parser/query_parser.y', 126
15
15
  attr_accessor :default_field, :fields, :handle_parse_errors
16
16
 
17
17
  def initialize(default_field = "*", options = {})
@@ -53,11 +53,11 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id81dbd4349
53
53
  case str
54
54
  when /\A\s+/
55
55
  ;
56
- when /\A[#{ECHR}]/
56
+ when /\A([#{EWCHR}]|[*?](?=:))/
57
57
  @q.push [ RESERVED[$&]||$&, $& ]
58
58
  when /\A(\&\&|\|\|)/
59
59
  @q.push [ RESERVED[$&], $& ]
60
- when /\A(\\[#{ECHR}]|[^\s#{ECHR}])+[?*](\\[#{EWCHR}]|[^\s#{EWCHR}])*/
60
+ when /\A(\\[#{ECHR}]|[^\s#{ECHR}])*[?*](\\[#{EWCHR}]|[^\s#{EWCHR}])*/
61
61
  str = $'
62
62
  unescaped = $&.gsub(/\\(?!\\)/,"")
63
63
  @q.push [ :WILD_STRING, unescaped ]
@@ -82,8 +82,8 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id81dbd4349
82
82
  end
83
83
 
84
84
  @q.push([ false, '$' ])
85
- #p @q
86
85
 
86
+ query = nil
87
87
  begin
88
88
  query = do_parse
89
89
  rescue Racc::ParseError => e
@@ -199,7 +199,7 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id81dbd4349
199
199
  tokens << token
200
200
  end
201
201
  if tokens.length == 0
202
- return nil
202
+ return TermQuery.new(Term.new(field, ""))
203
203
  elsif tokens.length == 1
204
204
  return TermQuery.new(Term.new(field, tokens[0].term_text))
205
205
  else
@@ -221,7 +221,7 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id81dbd4349
221
221
  return FuzzyQuery.new(Term.new(field, token.term_text))
222
222
  end
223
223
  else
224
- return nil
224
+ return TermQuery.new(Term.new(field, ""))
225
225
  end
226
226
  end
227
227
 
@@ -402,7 +402,7 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id81dbd4349
402
402
  return qp.parse(query)
403
403
  end
404
404
 
405
- ..end lib/ferret/query_parser/query_parser.y modeval..id81dbd43492
405
+ ..end lib/ferret/query_parser/query_parser.y modeval..id6e7f6ac20b
406
406
 
407
407
  ##### racc 1.4.4 generates ###
408
408