ferret 0.2.1 → 0.2.2

@@ -22,7 +22,7 @@
 #++
 # :include: ../TUTORIAL
 module Ferret
-  VERSION = '0.2.1'
+  VERSION = '0.2.2'
 end
 
 require 'ferret/utils'
@@ -32,9 +32,8 @@ module Ferret::Analysis
   # An array containing some common English words that are not usually useful
   # for searching.
   ENGLISH_STOP_WORDS = [
-    "a", "an", "and", "are", "as", "at", "be", "but", "by",
-    "for", "if", "in", "into", "is", "it",
-    "no", "not", "of", "on", "or", "s", "such",
+    "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if",
+    "in", "into", "is", "it", "no", "not", "of", "on", "or", "s", "such",
     "t", "that", "the", "their", "then", "there", "these",
     "they", "this", "to", "was", "will", "with"
   ]
@@ -51,6 +50,8 @@ module Ferret::Analysis
   end
 
   # An Analyzer that filters LetterTokenizer with LowerCaseFilter.
+  # This analyzer subclasses the StopAnalyzer so you can add your own
+  # stoplist the same way. See StopAnalyzer.
   class StandardAnalyzer < StopAnalyzer
     def token_stream(field, string)
      return StopFilter.new(LowerCaseFilter.new(StandardTokenizer.new(string)), @stop_words)
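
The new comment documents that StandardAnalyzer inherits StopAnalyzer's stop-list handling. A minimal sketch of what that enables, assuming StandardAnalyzer.new accepts a stop-word array the way StopAnalyzer.new does (the extra stop words here are made up):

    include Ferret::Analysis

    # Assumed constructor form; ENGLISH_STOP_WORDS is the constant
    # defined earlier in this module.
    my_stop_words = ENGLISH_STOP_WORDS + ["ruby", "gem"]
    analyzer = StandardAnalyzer.new(my_stop_words)
    stream = analyzer.token_stream("content", "A Ruby gem for searching")
    while token = stream.next
      puts token.term_text  # stop words filtered out, text downcased
    end
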
@@ -84,7 +85,7 @@ module Ferret::Analysis
     def token_stream(field, string)
       analyzer = @analyzers[field]
       if (analyzer == nil)
-        analyzer = @default_analyzer;
+        analyzer = @default_analyzer
       end
 
       return analyzer.token_stream(field, string)
@@ -277,28 +277,15 @@ module Ferret::Document
       str = ""
       if (@stored)
         str << "stored"
-        @str << @compressed ? "/compressed," : "/uncompressed,"
+        str << (@compressed ? "/compressed," : "/uncompressed,")
       end
-      if (@indexed) then str << "indexed," end
-      if (@tokenized) then str << "tokenized," end
-      if (@store_term_vector) then str << "store_term_vector," end
-      if (@store_offset)
-        str << "term_vector_offsets,"
-      end
-      if (@store_position)
-        str << "term_vector_position,"
-      end
-      if (@binary) then str << "binary," end
-
-      str << '<'
-      str << @name
-      str << ':'
-
-      if (@data != null)
-        str << @data.to_s
-      end
-
-      str << '>'
+      str << "indexed," if (@indexed)
+      str << "tokenized," if (@tokenized)
+      str << "store_term_vector," if (@store_term_vector)
+      str << "tv_offset," if (@store_offset)
+      str << "tv_position," if (@store_position)
+      str << "binary," if (@binary)
+      str << "<#{@name}:#{data}>"
     end
   end
 end
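
For reference, the rewritten to_s produces the compact one-line summaries asserted in the FieldTest changes further down, e.g.:

    include Ferret::Document

    f = Field.new("name", "value", Field::Store::COMPRESS,
                  Field::Index::TOKENIZED)
    f.to_s  #=> "stored/compressed,indexed,tokenized,<name:value>"
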
@@ -107,10 +107,10 @@ module Ferret::Index
     end
 
     # Not implemented
-    def delete(name) raise(UnsupportedOperationError) end
+    def remove(name) raise(NotImplementedError) end
 
     # Not implemented
-    def rename(from, to) raise(UnsupportedOperationError) end
+    def rename(from, to) raise(NotImplementedError) end
 
     # Returns the length of a file in the directory.
     def length(name)
@@ -120,10 +120,10 @@ module Ferret::Index
     end
 
     # Not implemented
-    def create_output(name) raise(UnsupportedOperationError) end
+    def create_output(name) raise(NotImplementedError) end
 
     # Not implemented
-    def make_lock(name) raise(UnsupportedOperationError) end
+    def make_lock(name) raise(NotImplementedError) end
 
   # Implementation of an IndexInput that reads from a portion of the
   # compound file.
@@ -206,8 +206,8 @@ module Ferret::Index
     # Add a source stream. _file_name_ is the string by which the
     # sub-stream will be known in the compound stream.
     #
-    # Throws:: StateError if this writer is closed
-    # Throws:: ArgumentError if a file with the same name
+    # Raises:: StateError if this writer is closed
+    # Raises:: ArgumentError if a file with the same name
     #          has been added already
     def add_file(file_name)
       if @merged
@@ -253,7 +253,7 @@ module Ferret::Index
       # Remember the positions of directory entries so that we can
       # adjust the offsets later
       @file_entries.each do |fe|
-        fe.directory_offset = os.pos()
+        fe.dir_offset = os.pos()
         os.write_long(0) # for now
         os.write_string(fe.file_name)
       end
@@ -267,7 +267,7 @@ module Ferret::Index
 
       # Write the data offsets into the directory of the compound stream
       @file_entries.each do |fe|
-        os.seek(fe.directory_offset)
+        os.seek(fe.dir_offset)
         os.write_long(fe.data_offset)
       end
 
@@ -292,15 +292,7 @@ module Ferret::Index
     private
 
     # Internal class for holding a file
-    class FileEntry
-
-      attr_accessor :file_name, :directory_offset, :data_offset
-
-      def initialize(file_name)
-        @file_name = file_name
-      end
-
-    end
+    FileEntry = Struct.new(:file_name, :dir_offset, :data_offset)
 
     # Copy the contents of the file with specified extension into the
     # provided output stream. Use a buffer for moving data
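
The Struct.new one-liner behaves the same as the hand-written class it replaces: positional members double as accessors, so the fe.dir_offset and fe.data_offset assignments in the surrounding code keep working. A quick illustration (the file name is made up):

    FileEntry = Struct.new(:file_name, :dir_offset, :data_offset)

    fe = FileEntry.new("_1.fnm")  # dir_offset and data_offset default to nil
    fe.dir_offset = 128
    fe.data_offset = 4096
    fe.file_name                  #=> "_1.fnm"
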
@@ -324,9 +316,9 @@ module Ferret::Index
       # Verify that remainder is 0
       if (remainder != 0)
         raise(IOError,
-          "Non-zero remainder length after copying: " + remainder.to_s +
-          " (id: " + source.file_name + ", length: " + length.to_s +
-          ", buffer size: " + Ferret::Store::BUFFER_SIZE.to_s + ")")
+          "Non-zero remainder length after copying: #{remainder} " +
+          "(id: #{source.file_name}, length: #{length}, buffer size: " +
+          " #{Ferret::Store::BUFFER_SIZE})")
       end
 
       # Verify that the output length diff is equal to original file
  # Verify that the output length diff is equal to original file
@@ -334,8 +326,8 @@ module Ferret::Index
334
326
  diff = end_ptr - start_ptr
335
327
  if (diff != length)
336
328
  raise(IOError,
337
- "Difference in the output file offsets " + diff.to_s +
338
- " does not match the original file length " + length.to_s)
329
+ "Difference in the output file offsets #{diff}" +
330
+ " does not match the original file length #{length}")
339
331
  end
340
332
 
341
333
  ensure
@@ -76,6 +76,23 @@ module Ferret::Index
   #                       be replaced by the new object. This will slow
   #                       down indexing so it should not be used if
   #                       performance is a concern.
+  # use_compound_file::   Uses a compound file to store the index. This
+  #                       prevents an error being raised for having too
+  #                       many files open at the same time. The default is
+  #                       true but performance is better if this is set to
+  #                       false.
+  # handle_parse_errors:: Set this to true if you want the QueryParser to
+  #                       degrade gracefully on errors. If the query parser
+  #                       fails to parse this query, it will try to parse
+  #                       it as a straight boolean query on the default
+  #                       field ignoring all query punctuation. If this
+  #                       fails, it will return an empty TermQuery. If you
+  #                       use this and you need to know why your query
+  #                       isn't working you can use the Query#to_s method
+  #                       on the query returned to see what is happening to
+  #                       your query. This defaults to true. If you set it
+  #                       to false a QueryParseException is raised on a
+  #                       query parse error.
   #
   # Some examples;
   #
@@ -86,7 +103,8 @@ module Ferret::Index
   #
   #   index = Index::Index.new(:dir => directory,
   #                            :close_dir => false
-  #                            :default_slop => 2)
+  #                            :default_slop => 2,
+  #                            :handle_parse_errors => false)
   #
   def initialize(options = {})
     super()
@@ -117,6 +135,7 @@ module Ferret::Index
     @default_search_field = (@options[:default_search_field] || \
                              @options[:default_field] || "*")
     @default_field = @options[:default_field] || ""
+    @options[:handle_parse_errors] = true if @options[:handle_parse_errors].nil?
     @open = true
     @qp = nil
   end
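
A sketch of the two new options in use, based on the documentation added above (the :path value and the query string are illustrative; both options default to true, per the docs and the nil-check just added):

    index = Ferret::Index::Index.new(:path => "/tmp/myindex",
                                     :use_compound_file => false,
                                     :handle_parse_errors => false)
    begin
      # 'field:(' is deliberately malformed; with :handle_parse_errors
      # disabled the parser raises instead of degrading gracefully.
      index.search_each('field:(') { |doc, score| puts "#{doc}: #{score}" }
    rescue Ferret::QueryParser::QueryParseException => e
      puts "unparseable query: #{e.message}"
    end
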
@@ -100,7 +100,7 @@ module Ferret::Index
     if directory.nil?
       directory = Ferret::Store::RAMDirectory.new
     elsif directory.is_a?(String)
-      directory = Ferret::Store::FSDirectory.new(directory, true)
+      directory = Ferret::Store::FSDirectory.new(directory, false)
     end
     directory.synchronize do # in- & inter-process sync
       commit_lock = directory.make_lock(IndexWriter::COMMIT_LOCK_NAME)
@@ -83,21 +83,21 @@ module Index
     @close_dir = options[:close_dir] || false
     @use_compound_file = (options[:use_compound_file] != false) # ie default true
     @analyzer = options[:analyzer] || Ferret::Analysis::StandardAnalyzer.new
-    @merge_factor = DEFAULT_MERGE_FACTOR
-    @min_merge_docs = DEFAULT_MIN_MERGE_DOCS
-    @max_merge_docs = DEFAULT_MAX_MERGE_DOCS
-    @max_field_length = DEFAULT_MAX_FIELD_LENGTH
-    @term_index_interval = DEFAULT_TERM_INDEX_INTERVAL
+    @merge_factor = options[:merge_factor] || DEFAULT_MERGE_FACTOR
+    @min_merge_docs = options[:min_merge_docs] || DEFAULT_MIN_MERGE_DOCS
+    @max_merge_docs = options[:max_merge_docs] || DEFAULT_MAX_MERGE_DOCS
+    @max_field_length = options[:max_field_length] || DEFAULT_MAX_FIELD_LENGTH
+    @term_index_interval = options[:term_index_interval] || DEFAULT_TERM_INDEX_INTERVAL
 
     @similarity = Search::Similarity.default
     @segment_infos = SegmentInfos.new()
     @ram_directory = Ferret::Store::RAMDirectory.new()
 
     # Make sure that the lock is released when this object is destroyed
-    define_finalizer(self, proc { |id| @write_lock.release() if @write_lock})
 
     @write_lock = @directory.make_lock(WRITE_LOCK_NAME)
     @write_lock.obtain(WRITE_LOCK_TIMEOUT) # obtain write lock
+    define_finalizer(@write_lock, proc { |id| @write_lock.release() if @write_lock})
 
     @directory.synchronize() do # in- & inter-process sync
       @directory.make_lock(COMMIT_LOCK_NAME).while_locked(COMMIT_LOCK_TIMEOUT) do
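
With this change the merge tuning values can be overridden per writer rather than always taking the compile-time defaults. A rough sketch, assuming the writer is constructed from a directory plus this options hash (the exact constructor form is an assumption):

    writer = Ferret::Index::IndexWriter.new(directory,
               :analyzer => Ferret::Analysis::StandardAnalyzer.new,
               :merge_factor => 100,     # merge fewer, larger batches
               :min_merge_docs => 1000)  # buffer more docs in RAM first
    # ... add documents ...
    writer.close
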
@@ -16,16 +16,17 @@ module Ferret::Index
       @segment = info.name
 
       @cfs_reader = nil
-      cfs = directory
-      if directory.exists?(@segment + '.cfs') then
+      dir = directory
+      #if directory.exists?(@segment + '.cfs') then
+      if SegmentReader.uses_compound_file?(info)
         @cfs_reader = CompoundFileReader.new(directory, @segment + '.cfs')
-        cfs = @cfs_reader
+        dir = @cfs_reader
       end
 
-      @field_infos = FieldInfos.new(cfs, @segment + '.fnm')
-      @fields_reader = FieldsReader.new(cfs, @segment, @field_infos)
+      @field_infos = FieldInfos.new(dir, @segment + '.fnm')
+      @fields_reader = FieldsReader.new(dir, @segment, @field_infos)
 
-      @term_infos = TermInfosReader.new(cfs, @segment, @field_infos)
+      @term_infos = TermInfosReader.new(dir, @segment, @field_infos)
       @deleted_docs = nil
       @deleted_docs_dirty = false
       if SegmentReader.has_deletions?(info) then
@@ -33,16 +34,16 @@ module Ferret::Index
           Ferret::Utils::BitVector.read(directory, @segment + '.del')
       end
 
-      @freq_stream = cfs.open_input(@segment + '.frq')
-      @prox_stream = cfs.open_input(@segment + '.prx')
+      @freq_stream = dir.open_input(@segment + '.frq')
+      @prox_stream = dir.open_input(@segment + '.prx')
       @norms = {}
       @norms.extend(MonitorMixin)
       @norms_dirty = false
-      open_norms(cfs)
+      open_norms(dir)
 
       @tv_reader_orig = nil
       if @field_infos.has_vectors? then
-        @tv_reader_orig = TermVectorsReader.new(cfs, @segment, @field_infos)
+        @tv_reader_orig = TermVectorsReader.new(dir, @segment, @field_infos)
       end
     end
 
@@ -128,9 +129,9 @@ module Ferret::Index
       @field_infos.each_with_index do |fi, i|
         if (fi.indexed?)
           if @cfs_reader.nil?
-            name = @segment + ".f" + i.to_s
+            name = "#{@segment}.f#{i}"
           else
-            name = @segment + ".s" + i.to_s
+            name = "#{@segment}.s#{i}"
           end
           if (@directory.exists?(name))
             file_names << name
@@ -242,17 +242,29 @@ module Ferret
     #
     # === Options
     #
-    # analyzer::      The analyzer is used to break phrases up into terms and
-    #                 to turn terms in tokens recognized in the index.
-    #                 Analysis::Analyzer is the default
-    # occur_default:: Set to either BooleanClause::Occur::SHOULD (default)
-    #                 or BooleanClause::Occur::MUST to specify the default
-    #                 Occur operator.
-    # wild_lower::    Set to false if you don't want the terms in fuzzy and
-    #                 wild queries to be set to lower case. You should do this
-    #                 if your analyzer doesn't downcase. The default is true.
-    # default_slop::  Set the default slop for phrase queries. This defaults
-    #                 to 0.
+    # analyzer::            The analyzer is used to break phrases up into
+    #                       terms and to turn terms in tokens recognized in
+    #                       the index. Analysis::Analyzer is the default
+    # occur_default::       Set to either BooleanClause::Occur::SHOULD
+    #                       (default) or BooleanClause::Occur::MUST to specify
+    #                       the default Occur operator.
+    # wild_lower::          Set to false if you don't want the terms in fuzzy
+    #                       and wild queries to be set to lower case. You
+    #                       should do this if your analyzer doesn't downcase.
+    #                       The default is true.
+    # default_slop::        Set the default slop for phrase queries. This
+    #                       defaults to 0.
+    # handle_parse_errors:: Set this to true if you want the QueryParser to
+    #                       degrade gracefully on errors. If the query parser
+    #                       fails to parse this query, it will try to parse it
+    #                       as a straight boolean query on the default field
+    #                       ignoring all query punctuation. If this fails, it
+    #                       will return an empty TermQuery. If you use this
+    #                       and you need to know why your query isn't working
+    #                       you can use the Query#to_s method on the query
+    #                       returned to see what is happening to your query.
+    #                       This defaults to false, in which case a
+    #                       QueryParseException is thrown.
     def initialize(default_field = "", options = {})
     end
 
@@ -263,10 +275,10 @@ module Ferret
 
     # Set to false if you don't want the terms in fuzzy and wild queries to be
     # set to lower case. You should do this if your analyzer doesn't downcase.
-    def wild_lower()
+    def wild_lower=()
     end
 
-    # Returns the value of wild_lower. See #wild_lower.
+    # Returns the value of wild_lower. See #wild_lower=.
     def wild_lower?()
     end
 
@@ -276,7 +288,25 @@ module Ferret
     # if you'd like to do your own query string cleaning.
     def clean_string(str)
     end
+
+    # The exception thrown when there is an error parsing the query string.
+    # This also holds the Racc::ParseError that was thrown in case you want to
+    # investigate why a query won't parse.
+    class QueryParseException < Exception
+      attr_reader :parse_error
+
+      # Create a new QueryParseException
+      #
+      # error::       An error string describing the query that failed
+      # parse_error:: The actual parse error that was thrown by Racc. It is a
+      #               Racc::ParseError object.
+      def initialize(error, parse_error)
+        super(error)
+        @parse_error = parse_error
+      end
+    end
   end
+
 end
 
 require 'ferret/query_parser/query_parser.tab.rb'
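
A sketch of catching the new exception and inspecting the underlying Racc error; the malformed query string is taken from the tests further down, and handle_parse_errors defaults to false for a bare QueryParser, so parse raises:

    parser = Ferret::QueryParser.new("xxx", :fields => ["f1", "f2"])
    begin
      parser.parse('()*&)(*^&*(')  # known-bad string from the test suite
    rescue Ferret::QueryParser::QueryParseException => e
      puts e.message   # "Could not parse ..."
      p e.parse_error  # the original Racc::ParseError
    end
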
@@ -11,15 +11,8 @@ module Ferret
 
   class QueryParser < Racc::Parser
 
-module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id9e08d44076', 'lib/ferret/query_parser/query_parser.y', 126
-  attr_accessor :default_field, :fields
-
-  # true if you want to downcase wild card queries. This is set to try by
-  # default.
-  attr_writer :wild_lower
-
-  def wild_lower?() @wild_lower end
-
+module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id81dbd43492', 'lib/ferret/query_parser/query_parser.y', 126
+  attr_accessor :default_field, :fields, :handle_parse_errors
 
   def initialize(default_field = "*", options = {})
     @yydebug = true
@@ -32,6 +25,7 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id9e08d4407
     @occur_default = options[:occur_default] || BooleanClause::Occur::SHOULD
     @default_slop = options[:default_slop] || 0
     @fields = options[:fields]||[]
+    @handle_parse_errors = options[:handle_parse_errors] || false
   end
 
   RESERVED = {
@@ -50,6 +44,7 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id9e08d4407
   EWCHR = %q,:()\[\]{}!+"~^\-\|<>\=,
 
   def parse(str)
+    orig_str = str
     str = clean_string(str)
     str.strip!
     @q = []
@@ -82,10 +77,24 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id9e08d4407
       end
       str = $'
     end
-    @q.push [ false, '$' ]
+    if @q.empty?
+      return TermQuery.new(Term.new(@default_field, ""))
+    end
+
+    @q.push([ false, '$' ])
     #p @q
 
-    do_parse
+    begin
+      query = do_parse
+    rescue Racc::ParseError => e
+      if @handle_parse_errors
+        @field = @default_field
+        query = _get_bad_query(orig_str)
+      else
+        raise QueryParseException.new("Could not parse #{str}", e)
+      end
+    end
+    return query
   end
 
   def next_token
@@ -160,6 +169,25 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id9e08d4407
     return new_str.pack("c*")
   end
 
+  def get_bad_query(field, str)
+    tokens = []
+    stream = @analyzer.token_stream(field, str)
+    while token = stream.next
+      tokens << token
+    end
+    if tokens.length == 0
+      return TermQuery.new(Term.new(field, ""))
+    elsif tokens.length == 1
+      return TermQuery.new(Term.new(field, tokens[0].term_text))
+    else
+      bq = BooleanQuery.new()
+      tokens.each do |token|
+        bq << BooleanClause.new(TermQuery.new(Term.new(field, token.term_text)))
+      end
+      return bq
+    end
+  end
+
   def get_range_query(field, start_word, end_word, inc_upper, inc_lower)
     RangeQuery.new(field, start_word, end_word, inc_upper, inc_lower)
   end
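
Tying the pieces together: with :handle_parse_errors enabled, an unparseable string is re-analyzed by get_bad_query into a plain boolean query over the default field. The expected outputs below come straight from the new test_bad_queries test:

    parser = Ferret::QueryParser.new("xxx", :fields => ["f1", "f2"],
                                     :handle_parse_errors => true)
    parser.parse('()*&one)(*two(*&"').to_s("xxx")  #=> "one two"
    parser.parse('()*&)(*^&*(').to_s("xxx")        #=> ""
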
@@ -374,7 +402,7 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id9e08d4407
     return qp.parse(query)
   end
 
-..end lib/ferret/query_parser/query_parser.y modeval..id9e08d44076
+..end lib/ferret/query_parser/query_parser.y modeval..id81dbd43492
 
 ##### racc 1.4.4 generates ###
 
@@ -893,7 +921,8 @@ if __FILE__ == $0
 
   parser = Ferret::QueryParser.new("default",
                                    :fields => ["f1", "f2", "f3"],
-                                   :analyzer => Ferret::Analysis::StandardAnalyzer.new)
+                                   :analyzer => Ferret::Analysis::StandardAnalyzer.new,
+                                   :handle_parse_errors => true)
 
   $stdin.each do |line|
     query = parser.parse(line)
@@ -90,12 +90,17 @@ module Ferret::Search
     filter = options[:filter]
     first_doc = options[:first_doc]||0
     num_docs = options[:num_docs]||10
+    max_size = first_doc + num_docs
     sort = options[:sort]
 
-    if (num_docs <= 0) # nil might be returned from hq.top() below.
+    if (num_docs <= 0)
      raise ArgumentError, "num_docs must be > 0 to run a search"
     end
 
+    if (first_doc < 0)
+      raise ArgumentError, "first_doc must be >= 0 to run a search"
+    end
+
     scorer = query.weight(self).scorer(@reader)
     if (scorer == nil)
       return TopDocs.new(0, [])
@@ -104,33 +109,32 @@ module Ferret::Search
     bits = (filter.nil? ? nil : filter.bits(@reader))
     if (sort)
       fields = sort.is_a?(Array) ? sort : sort.fields
-      hq = FieldSortedHitQueue.new(@reader, fields, num_docs + first_doc)
+      hq = FieldSortedHitQueue.new(@reader, fields, max_size)
     else
-      hq = HitQueue.new(num_docs + first_doc)
+      hq = HitQueue.new(max_size)
     end
     total_hits = 0
     min_score = 0.0
     scorer.each_hit() do |doc, score|
       if score > 0.0 and (bits.nil? or bits.get(doc)) # skip docs not in bits
         total_hits += 1
-        if hq.size < num_docs or score >= min_score
+        if hq.size < max_size or score >= min_score
           hq.insert(ScoreDoc.new(doc, score))
           min_score = hq.top.score # maintain min_score
         end
       end
     end
 
-    score_docs = Array.new(hq.size)
+    score_docs = []
     if (hq.size > first_doc)
-      score_docs = Array.new(hq.size - first_doc)
-      first_doc.times { hq.pop }
-      (hq.size - 1).downto(0) do |i|
-        score_docs[i] = hq.pop
+      if (hq.size - first_doc) < num_docs
+        num_docs = hq.size - first_doc
+      end
+      num_docs.times do
+        score_docs.unshift(hq.pop)
       end
-    else
-      score_docs = []
-      hq.clear
     end
+    hq.clear
 
     return TopDocs.new(total_hits, score_docs)
   end
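
The rewritten tail means :first_doc and :num_docs now act as a proper offset/limit pair, clamped to the hits actually collected, as exercised by the new test_first_doc test. Usage sketch, with searcher and query standing in for objects set up elsewhere:

    top_docs = searcher.search(query, :first_doc => 10, :num_docs => 10)
    top_docs.score_docs.each do |score_doc|
      puts "doc #{score_doc.doc} scored #{score_doc.score}"
    end
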
@@ -25,8 +25,6 @@ module Ferret::Search
       @weight = weight
       @term_docs = td
       @norms = norms
-      #XXX
-      @norms_size = @norms.size
       @weight_value = weight.value
 
       SCORE_CACHE_SIZE.times do |i|
@@ -37,13 +37,15 @@ class FieldTest < Test::Unit::TestCase
     assert_equal(false, f.store_offsets?)
     assert_equal(false, f.store_positions?)
     assert_equal(false, f.binary?)
+    assert_equal("stored/compressed,indexed,tokenized,<name:value>", f.to_s)
   end
 
   def test_set_store()
-    f = Field.new("name", "value", Field::Store::COMPRESS, Field::Index::TOKENIZED)
+    f = Field.new("name", nil, Field::Store::COMPRESS, Field::Index::TOKENIZED)
     f.stored = Field::Store::NO
     assert_equal(false, f.stored?)
     assert_equal(false, f.compressed?)
+    assert_equal("indexed,tokenized,<name:>", f.to_s)
   end
 
   def test_set_index()
@@ -51,6 +53,7 @@ class FieldTest < Test::Unit::TestCase
     f.index = Field::Index::NO
     assert_equal(false, f.indexed?)
     assert_equal(false, f.tokenized?)
+    assert_equal("stored/compressed,<name:value>", f.to_s)
   end
 
   def test_set_term_vector()
@@ -59,6 +62,7 @@ class FieldTest < Test::Unit::TestCase
     assert_equal(true, f.store_term_vector?)
     assert_equal(true, f.store_offsets?)
     assert_equal(true, f.store_positions?)
+    assert_equal("stored/compressed,indexed,tokenized,store_term_vector,tv_offset,tv_position,<name:value>", f.to_s)
   end
 
   def test_new_binary_field()
@@ -76,5 +80,6 @@ class FieldTest < Test::Unit::TestCase
     assert_equal(false, f.store_offsets?)
     assert_equal(false, f.store_positions?)
     assert_equal(true, f.binary?)
+    assert_equal("stored/uncompressed,binary,<name:#{bin}>", f.to_s)
   end
 end
@@ -5,6 +5,7 @@ class QueryParserTest < Test::Unit::TestCase
   def test_strings()
     parser = Ferret::QueryParser.new("xxx", :fields => ["f1", "f2", "f3"])
     pairs = [
+      ['', ''],
       ['word', 'word'],
       ['field:word', 'field:word'],
       ['"word1 word2 word3"', '"word word word"'],
@@ -92,8 +93,8 @@ class QueryParserTest < Test::Unit::TestCase
       ['"onewordphrase"', 'onewordphrase']
     ]
 
-    pairs.each do |pair|
-      assert_equal(pair[1], parser.parse(pair[0]).to_s(parser.default_field))
+    pairs.each do |query_str, expected|
+      assert_equal(expected, parser.parse(query_str).to_s(parser.default_field))
     end
   end
 
@@ -105,8 +106,32 @@ class QueryParserTest < Test::Unit::TestCase
       ['key:(1234)', 'key:1234']
     ]
 
-    pairs.each do |pair|
-      assert_equal(pair[1], parser.parse(pair[0]).to_s(parser.default_field))
+    pairs.each do |query_str, expected|
+      assert_equal(expected, parser.parse(query_str).to_s(parser.default_field))
+    end
+  end
+
+  def do_test_query_parse_exception_raised(str)
+    parser = Ferret::QueryParser.new("xxx", :fields => ["f1", "f2", "f3"])
+    assert_raise(Ferret::QueryParser::QueryParseException) do
+      parser.parse(str)
+    end
+  end
+
+
+  def test_bad_queries
+    parser = Ferret::QueryParser.new("xxx", :fields => ["f1", "f2"],
+                                     :handle_parse_errors => true)
+
+    pairs = [
+      ['(*word', 'word'],
+      ['()*&)(*^&*(', ''],
+      ['()*&one)(*two(*&"', 'one two']
+    ]
+
+    pairs.each do |query_str, expected|
+      do_test_query_parse_exception_raised(query_str)
+      assert_equal(expected, parser.parse(query_str).to_s(parser.default_field))
     end
   end
 end
@@ -46,6 +46,15 @@ class IndexSearcherTest < Test::Unit::TestCase
     end
   end
 
+  def check_docs(query, options, expected=[])
+    top_docs = @is.search(query, options)
+    docs = top_docs.score_docs
+    assert_equal(expected.length, docs.length)
+    docs.length.times do |i|
+      assert_equal(expected[i], docs[i].doc)
+    end
+  end
+
   def test_get_doc()
     assert_equal(18, @is.max_doc)
     assert_equal("20050930", @is.doc(0).values(:date))
@@ -57,15 +66,38 @@ class IndexSearcherTest < Test::Unit::TestCase
     tq.boost = 100
     check_hits(tq, [1,4,8])
 
+    tq = TermQuery.new(Term.new("field", ""));
+    check_hits(tq, [])
+
     tq = TermQuery.new(Term.new("field", "word1"));
     top_docs = @is.search(tq)
-    #puts top_docs.score_docs
     assert_equal(@documents.size, top_docs.total_hits)
     assert_equal(10, top_docs.score_docs.size)
     top_docs = @is.search(tq, {:num_docs => 20})
     assert_equal(@documents.size, top_docs.score_docs.size)
   end
 
+
+  def test_first_doc
+    tq = TermQuery.new(Term.new("field", "word1"));
+    tq.boost = 100
+    top_docs = @is.search(tq, {:num_docs => 100})
+    expected = []
+    top_docs.score_docs.each do |score_doc|
+      expected << score_doc.doc
+    end
+
+    assert_raise(ArgumentError) { @is.search(tq, {:first_doc => -1}) }
+    assert_raise(ArgumentError) { @is.search(tq, {:num_docs => 0}) }
+    assert_raise(ArgumentError) { @is.search(tq, {:num_docs => -1}) }
+
+    check_docs(tq, {:num_docs => 8, :first_doc => 0}, expected[0,8])
+    check_docs(tq, {:num_docs => 3, :first_doc => 1}, expected[1,3])
+    check_docs(tq, {:num_docs => 6, :first_doc => 2}, expected[2,6])
+    check_docs(tq, {:num_docs => 2, :first_doc => expected.length}, [])
+    check_docs(tq, {:num_docs => 2, :first_doc => expected.length + 100}, [])
+  end
+
   def test_boolean_query
     bq = BooleanQuery.new()
     tq1 = TermQuery.new(Term.new("field", "word1"))
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.8.11
 specification_version: 1
 name: ferret
 version: !ruby/object:Gem::Version
-  version: 0.2.1
-date: 2005-11-14 00:00:00 +09:00
+  version: 0.2.2
+date: 2005-11-22 00:00:00 +09:00
 summary: Ruby indexing library.
 require_paths:
 - lib