ferret 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -22,7 +22,7 @@
22
22
  #++
23
23
  # :include: ../TUTORIAL
24
24
  module Ferret
25
- VERSION = '0.2.1'
25
+ VERSION = '0.2.2'
26
26
  end
27
27
 
28
28
  require 'ferret/utils'
@@ -32,9 +32,8 @@ module Ferret::Analysis
32
32
  # An array containing some common English words that are not usually useful
33
33
  # for searching.
34
34
  ENGLISH_STOP_WORDS = [
35
- "a", "an", "and", "are", "as", "at", "be", "but", "by",
36
- "for", "if", "in", "into", "is", "it",
37
- "no", "not", "of", "on", "or", "s", "such",
35
+ "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if",
36
+ "in", "into", "is", "it", "no", "not", "of", "on", "or", "s", "such",
38
37
  "t", "that", "the", "their", "then", "there", "these",
39
38
  "they", "this", "to", "was", "will", "with"
40
39
  ]
@@ -51,6 +50,8 @@ module Ferret::Analysis
51
50
  end
52
51
 
53
52
  # An Analyzer that filters LetterTokenizer with LowerCaseFilter.
53
+ # This analyzer subclasses the StopAnalyzer so you can add your own
54
+ # stoplist the same way. See StopAnalyzer.
54
55
  class StandardAnalyzer < StopAnalyzer
55
56
  def token_stream(field, string)
56
57
  return StopFilter.new(LowerCaseFilter.new(StandardTokenizer.new(string)), @stop_words)
@@ -84,7 +85,7 @@ module Ferret::Analysis
84
85
  def token_stream(field, string)
85
86
  analyzer = @analyzers[field]
86
87
  if (analyzer == nil)
87
- analyzer = @default_analyzer;
88
+ analyzer = @default_analyzer
88
89
  end
89
90
 
90
91
  return analyzer.token_stream(field, string)
@@ -277,28 +277,15 @@ module Ferret::Document
277
277
  str = ""
278
278
  if (@stored)
279
279
  str << "stored"
280
- @str << @compressed ? "/compressed," : "/uncompressed,"
280
+ str << (@compressed ? "/compressed," : "/uncompressed,")
281
281
  end
282
- if (@indexed) then str << "indexed," end
283
- if (@tokenized) then str << "tokenized," end
284
- if (@store_term_vector) then str << "store_term_vector," end
285
- if (@store_offset)
286
- str << "term_vector_offsets,"
287
- end
288
- if (@store_position)
289
- str << "term_vector_position,"
290
- end
291
- if (@binary) then str << "binary," end
292
-
293
- str << '<'
294
- str << @name
295
- str << ':'
296
-
297
- if (@data != null)
298
- str << @data.to_s
299
- end
300
-
301
- str << '>'
282
+ str << "indexed," if (@indexed)
283
+ str << "tokenized," if (@tokenized)
284
+ str << "store_term_vector," if (@store_term_vector)
285
+ str << "tv_offset," if (@store_offset)
286
+ str << "tv_position," if (@store_position)
287
+ str << "binary," if (@binary)
288
+ str << "<#{@name}:#{data}>"
302
289
  end
303
290
  end
304
291
  end
@@ -107,10 +107,10 @@ module Ferret::Index
107
107
  end
108
108
 
109
109
  # Not implemented
110
- def delete(name) raise(UnsupportedOperationError) end
110
+ def remove(name) raise(NotImplementedError) end
111
111
 
112
112
  # Not implemented
113
- def rename(from, to) raise(UnsupportedOperationError) end
113
+ def rename(from, to) raise(NotImplementedError) end
114
114
 
115
115
  # Returns the length of a file in the directory.
116
116
  def length(name)
@@ -120,10 +120,10 @@ module Ferret::Index
120
120
  end
121
121
 
122
122
  # Not implemented
123
- def create_output(name) raise(UnsupportedOperationError) end
123
+ def create_output(name) raise(NotImplementedError) end
124
124
 
125
125
  # Not implemented
126
- def make_lock(name) raise(UnsupportedOperationError) end
126
+ def make_lock(name) raise(NotImplementedError) end
127
127
 
128
128
  # Implementation of an IndexInput that reads from a portion of the
129
129
  # compound file.
@@ -206,8 +206,8 @@ module Ferret::Index
206
206
  # Add a source stream. _file_name_ is the string by which the
207
207
  # sub-stream will be known in the compound stream.
208
208
  #
209
- # Throws:: StateError if this writer is closed
210
- # Throws:: ArgumentError if a file with the same name
209
+ # Raises:: StateError if this writer is closed
210
+ # Raises:: ArgumentError if a file with the same name
211
211
  # has been added already
212
212
  def add_file(file_name)
213
213
  if @merged
@@ -253,7 +253,7 @@ module Ferret::Index
253
253
  # Remember the positions of directory entries so that we can
254
254
  # adjust the offsets later
255
255
  @file_entries.each do |fe|
256
- fe.directory_offset = os.pos()
256
+ fe.dir_offset = os.pos()
257
257
  os.write_long(0) # for now
258
258
  os.write_string(fe.file_name)
259
259
  end
@@ -267,7 +267,7 @@ module Ferret::Index
267
267
 
268
268
  # Write the data offsets into the directory of the compound stream
269
269
  @file_entries.each do |fe|
270
- os.seek(fe.directory_offset)
270
+ os.seek(fe.dir_offset)
271
271
  os.write_long(fe.data_offset)
272
272
  end
273
273
 
@@ -292,15 +292,7 @@ module Ferret::Index
292
292
  private
293
293
 
294
294
  # Internal class for holding a file
295
- class FileEntry
296
-
297
- attr_accessor :file_name, :directory_offset, :data_offset
298
-
299
- def initialize(file_name)
300
- @file_name = file_name
301
- end
302
-
303
- end
295
+ FileEntry = Struct.new(:file_name, :dir_offset, :data_offset)
304
296
 
305
297
  # Copy the contents of the file with specified extension into the
306
298
  # provided output stream. Use a buffer for moving data
@@ -324,9 +316,9 @@ module Ferret::Index
324
316
  # Verify that remainder is 0
325
317
  if (remainder != 0)
326
318
  raise(IOError,
327
- "Non-zero remainder length after copying: " + remainder.to_s +
328
- " (id: " + source.file_name + ", length: " + length.to_s +
329
- ", buffer size: " + Ferret::Store::BUFFER_SIZE.to_s + ")")
319
+ "Non-zero remainder length after copying: #{remainder} " +
320
+ "(id: #{source.file_name}, length: #{length}, buffer size: " +
321
+ " #{Ferret::Store::BUFFER_SIZE})")
330
322
  end
331
323
 
332
324
  # Verify that the output length diff is equal to original file
@@ -334,8 +326,8 @@ module Ferret::Index
334
326
  diff = end_ptr - start_ptr
335
327
  if (diff != length)
336
328
  raise(IOError,
337
- "Difference in the output file offsets " + diff.to_s +
338
- " does not match the original file length " + length.to_s)
329
+ "Difference in the output file offsets #{diff}" +
330
+ " does not match the original file length #{length}")
339
331
  end
340
332
 
341
333
  ensure
@@ -76,6 +76,23 @@ module Ferret::Index
76
76
  # be replaced by the new object. This will slow
77
77
  # down indexing so it should not be used if
78
78
  # performance is a concern.
79
+ # use_compound_file:: Uses a compound file to store the index. This
80
+ # prevents an error being raised for having too
81
+ # many files open at the same time. The default is
82
+ # true but performance is better if this is set to
83
+ # false.
84
+ # handle_parse_errors:: Set this to true if you want the QueryParser to
85
+ # degrade gracefully on errors. If the query parser
86
+ # fails to parse this query, it will try to parse
87
+ # it as a straight boolean query on the default
88
+ # field ignoring all query punctuation. If this
89
+ # fails, it will return an empty TermQuery. If you
90
+ # use this and you need to know why your query
91
+ # isn't working you can use the Query#to_s method
92
+ # on the query returned to see what is happening to
93
+ # your query. This defaults to true. If you set it
94
+ # to false a QueryParseException is raised on a
95
+ # query parse error.
79
96
  #
80
97
  # Some examples;
81
98
  #
@@ -86,7 +103,8 @@ module Ferret::Index
86
103
  #
87
104
  # index = Index::Index.new(:dir => directory,
88
105
  # :close_dir => false
89
- # :default_slop => 2)
106
+ # :default_slop => 2,
107
+ # :handle_parse_errors => false)
90
108
  #
91
109
  def initialize(options = {})
92
110
  super()
@@ -117,6 +135,7 @@ module Ferret::Index
117
135
  @default_search_field = (@options[:default_search_field] || \
118
136
  @options[:default_field] || "*")
119
137
  @default_field = @options[:default_field] || ""
138
+ @options[:handle_parse_errors] = true if @options[:handle_parse_errors].nil?
120
139
  @open = true
121
140
  @qp = nil
122
141
  end
@@ -100,7 +100,7 @@ module Ferret::Index
100
100
  if directory.nil?
101
101
  directory = Ferret::Store::RAMDirectory.new
102
102
  elsif directory.is_a?(String)
103
- directory = Ferret::Store::FSDirectory.new(directory, true)
103
+ directory = Ferret::Store::FSDirectory.new(directory, false)
104
104
  end
105
105
  directory.synchronize do # in- & inter-process sync
106
106
  commit_lock = directory.make_lock(IndexWriter::COMMIT_LOCK_NAME)
@@ -83,21 +83,21 @@ module Index
83
83
  @close_dir = options[:close_dir] || false
84
84
  @use_compound_file = (options[:use_compound_file] != false) # ie default true
85
85
  @analyzer = options[:analyzer] || Ferret::Analysis::StandardAnalyzer.new
86
- @merge_factor = DEFAULT_MERGE_FACTOR
87
- @min_merge_docs = DEFAULT_MIN_MERGE_DOCS
88
- @max_merge_docs = DEFAULT_MAX_MERGE_DOCS
89
- @max_field_length = DEFAULT_MAX_FIELD_LENGTH
90
- @term_index_interval = DEFAULT_TERM_INDEX_INTERVAL
86
+ @merge_factor = options[:merge_factor] || DEFAULT_MERGE_FACTOR
87
+ @min_merge_docs = options[:min_merge_docs] || DEFAULT_MIN_MERGE_DOCS
88
+ @max_merge_docs = options[:max_merge_docs] || DEFAULT_MAX_MERGE_DOCS
89
+ @max_field_length = options[:max_field_length] || DEFAULT_MAX_FIELD_LENGTH
90
+ @term_index_interval = options[:term_index_interval] || DEFAULT_TERM_INDEX_INTERVAL
91
91
 
92
92
  @similarity = Search::Similarity.default
93
93
  @segment_infos = SegmentInfos.new()
94
94
  @ram_directory = Ferret::Store::RAMDirectory.new()
95
95
 
96
96
  # Make sure that the lock is released when this object is destroyed
97
- define_finalizer(self, proc { |id| @write_lock.release() if @write_lock})
98
97
 
99
98
  @write_lock = @directory.make_lock(WRITE_LOCK_NAME)
100
99
  @write_lock.obtain(WRITE_LOCK_TIMEOUT) # obtain write lock
100
+ define_finalizer(@write_lock, proc { |id| @write_lock.release() if @write_lock})
101
101
 
102
102
  @directory.synchronize() do # in- & inter-process sync
103
103
  @directory.make_lock(COMMIT_LOCK_NAME).while_locked(COMMIT_LOCK_TIMEOUT) do
@@ -16,16 +16,17 @@ module Ferret::Index
16
16
  @segment = info.name
17
17
 
18
18
  @cfs_reader = nil
19
- cfs = directory
20
- if directory.exists?(@segment + '.cfs') then
19
+ dir = directory
20
+ #if directory.exists?(@segment + '.cfs') then
21
+ if SegmentReader.uses_compound_file?(info)
21
22
  @cfs_reader = CompoundFileReader.new(directory, @segment + '.cfs')
22
- cfs = @cfs_reader
23
+ dir = @cfs_reader
23
24
  end
24
25
 
25
- @field_infos = FieldInfos.new(cfs, @segment + '.fnm')
26
- @fields_reader = FieldsReader.new(cfs, @segment, @field_infos)
26
+ @field_infos = FieldInfos.new(dir, @segment + '.fnm')
27
+ @fields_reader = FieldsReader.new(dir, @segment, @field_infos)
27
28
 
28
- @term_infos = TermInfosReader.new(cfs, @segment, @field_infos)
29
+ @term_infos = TermInfosReader.new(dir, @segment, @field_infos)
29
30
  @deleted_docs = nil
30
31
  @deleted_docs_dirty = false
31
32
  if SegmentReader.has_deletions?(info) then
@@ -33,16 +34,16 @@ module Ferret::Index
33
34
  Ferret::Utils::BitVector.read(directory, @segment + '.del')
34
35
  end
35
36
 
36
- @freq_stream = cfs.open_input(@segment + '.frq')
37
- @prox_stream = cfs.open_input(@segment + '.prx')
37
+ @freq_stream = dir.open_input(@segment + '.frq')
38
+ @prox_stream = dir.open_input(@segment + '.prx')
38
39
  @norms = {}
39
40
  @norms.extend(MonitorMixin)
40
41
  @norms_dirty = false
41
- open_norms(cfs)
42
+ open_norms(dir)
42
43
 
43
44
  @tv_reader_orig = nil
44
45
  if @field_infos.has_vectors? then
45
- @tv_reader_orig = TermVectorsReader.new(cfs, @segment, @field_infos)
46
+ @tv_reader_orig = TermVectorsReader.new(dir, @segment, @field_infos)
46
47
  end
47
48
  end
48
49
 
@@ -128,9 +129,9 @@ module Ferret::Index
128
129
  @field_infos.each_with_index do |fi, i|
129
130
  if (fi.indexed?)
130
131
  if @cfs_reader.nil?
131
- name = @segment + ".f" + i.to_s
132
+ name = "#{@segment}.f#{i}"
132
133
  else
133
- name = @segment + ".s" + i.to_s
134
+ name = "#{@segment}.s#{i}"
134
135
  end
135
136
  if (@directory.exists?(name))
136
137
  file_names << name
@@ -242,17 +242,29 @@ module Ferret
242
242
  #
243
243
  # === Options
244
244
  #
245
- # analyzer:: The analyzer is used to break phrases up into terms and
246
- # to turn terms in tokens recognized in the index.
247
- # Analysis::Analyzer is the default
248
- # occur_default:: Set to either BooleanClause::Occur::SHOULD (default)
249
- # or BooleanClause::Occur::MUST to specify the default
250
- # Occur operator.
251
- # wild_lower:: Set to false if you don't want the terms in fuzzy and
252
- # wild queries to be set to lower case. You should do this
253
- # if your analyzer doesn't downcase. The default is true.
254
- # default_slop:: Set the default slop for phrase queries. This defaults
255
- # to 0.
245
+ # analyzer:: The analyzer is used to break phrases up into
246
+ # terms and to turn terms in tokens recognized in
247
+ # the index. Analysis::Analyzer is the default
248
+ # occur_default:: Set to either BooleanClause::Occur::SHOULD
249
+ # (default) or BooleanClause::Occur::MUST to specify
250
+ # the default Occur operator.
251
+ # wild_lower:: Set to false if you don't want the terms in fuzzy
252
+ # and wild queries to be set to lower case. You
253
+ # should do this if your analyzer doesn't downcase.
254
+ # The default is true.
255
+ # default_slop:: Set the default slop for phrase queries. This
256
+ # defaults to 0.
257
+ # handle_parse_errors:: Set this to true if you want the QueryParser to
258
+ # degrade gracefully on errors. If the query parser
259
+ # fails to parse this query, it will try to parse it
260
+ # as a straight boolean query on the default field
261
+ # ignoring all query punctuation. If this fails, it
262
+ # will return an empty TermQuery. If you use this
263
+ # and you need to know why your query isn't working
264
+ # you can use the Query#to_s method on the query
265
+ # returned to see what is happening to your query.
266
+ # This defaults to false, in which case a
267
+ # QueryParseException is thrown.
256
268
  def initialize(default_field = "", options = {})
257
269
  end
258
270
 
@@ -263,10 +275,10 @@ module Ferret
263
275
 
264
276
  # Set to false if you don't want the terms in fuzzy and wild queries to be
265
277
  # set to lower case. You should do this if your analyzer doesn't downcase.
266
- def wild_lower()
278
+ def wild_lower=()
267
279
  end
268
280
 
269
- # Returns the value of wild_lower. See #wild_lower.
281
+ # Returns the value of wild_lower. See #wild_lower=.
270
282
  def wild_lower?()
271
283
  end
272
284
 
@@ -276,7 +288,25 @@ module Ferret
276
288
  # if you'd like to do your own query string cleaning.
277
289
  def clean_string(str)
278
290
  end
291
+
292
+ # The exception thrown when there is an error parsing the query string.
293
+ # This also holds the Racc::ParseError that was thrown in case you want to
294
+ # investigate why a query won't parse.
295
+ class QueryParseException < Exception
296
+ attr_reader :parse_error
297
+
298
+ # Create a new QueryParseException
299
+ #
300
+ # error:: An error string describing the query that failed
301
+ # parse_error:: The actual parse error that was thrown by Racc. It is a
302
+ # Racc::ParseError object.
303
+ def initialize(error, parse_error)
304
+ super(error)
305
+ @parse_error = parse_error
306
+ end
307
+ end
279
308
  end
309
+
280
310
  end
281
311
 
282
312
  require 'ferret/query_parser/query_parser.tab.rb'
@@ -11,15 +11,8 @@ module Ferret
11
11
 
12
12
  class QueryParser < Racc::Parser
13
13
 
14
- module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id9e08d44076', 'lib/ferret/query_parser/query_parser.y', 126
15
- attr_accessor :default_field, :fields
16
-
17
- # true if you want to downcase wild card queries. This is set to try by
18
- # default.
19
- attr_writer :wild_lower
20
-
21
- def wild_lower?() @wild_lower end
22
-
14
+ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id81dbd43492', 'lib/ferret/query_parser/query_parser.y', 126
15
+ attr_accessor :default_field, :fields, :handle_parse_errors
23
16
 
24
17
  def initialize(default_field = "*", options = {})
25
18
  @yydebug = true
@@ -32,6 +25,7 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id9e08d4407
32
25
  @occur_default = options[:occur_default] || BooleanClause::Occur::SHOULD
33
26
  @default_slop = options[:default_slop] || 0
34
27
  @fields = options[:fields]||[]
28
+ @handle_parse_errors = options[:handle_parse_errors] || false
35
29
  end
36
30
 
37
31
  RESERVED = {
@@ -50,6 +44,7 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id9e08d4407
50
44
  EWCHR = %q,:()\[\]{}!+"~^\-\|<>\=,
51
45
 
52
46
  def parse(str)
47
+ orig_str = str
53
48
  str = clean_string(str)
54
49
  str.strip!
55
50
  @q = []
@@ -82,10 +77,24 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id9e08d4407
82
77
  end
83
78
  str = $'
84
79
  end
85
- @q.push [ false, '$' ]
80
+ if @q.empty?
81
+ return TermQuery.new(Term.new(@default_field, ""))
82
+ end
83
+
84
+ @q.push([ false, '$' ])
86
85
  #p @q
87
86
 
88
- do_parse
87
+ begin
88
+ query = do_parse
89
+ rescue Racc::ParseError => e
90
+ if @handle_parse_errors
91
+ @field = @default_field
92
+ query = _get_bad_query(orig_str)
93
+ else
94
+ raise QueryParseException.new("Could not parse #{str}", e)
95
+ end
96
+ end
97
+ return query
89
98
  end
90
99
 
91
100
  def next_token
@@ -160,6 +169,25 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id9e08d4407
160
169
  return new_str.pack("c*")
161
170
  end
162
171
 
172
+ def get_bad_query(field, str)
173
+ tokens = []
174
+ stream = @analyzer.token_stream(field, str)
175
+ while token = stream.next
176
+ tokens << token
177
+ end
178
+ if tokens.length == 0
179
+ return TermQuery.new(Term.new(field, ""))
180
+ elsif tokens.length == 1
181
+ return TermQuery.new(Term.new(field, tokens[0].term_text))
182
+ else
183
+ bq = BooleanQuery.new()
184
+ tokens.each do |token|
185
+ bq << BooleanClause.new(TermQuery.new(Term.new(field, token.term_text)))
186
+ end
187
+ return bq
188
+ end
189
+ end
190
+
163
191
  def get_range_query(field, start_word, end_word, inc_upper, inc_lower)
164
192
  RangeQuery.new(field, start_word, end_word, inc_upper, inc_lower)
165
193
  end
@@ -374,7 +402,7 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id9e08d4407
374
402
  return qp.parse(query)
375
403
  end
376
404
 
377
- ..end lib/ferret/query_parser/query_parser.y modeval..id9e08d44076
405
+ ..end lib/ferret/query_parser/query_parser.y modeval..id81dbd43492
378
406
 
379
407
  ##### racc 1.4.4 generates ###
380
408
 
@@ -893,7 +921,8 @@ if __FILE__ == $0
893
921
 
894
922
  parser = Ferret::QueryParser.new("default",
895
923
  :fields => ["f1", "f2", "f3"],
896
- :analyzer => Ferret::Analysis::StandardAnalyzer.new)
924
+ :analyzer => Ferret::Analysis::StandardAnalyzer.new,
925
+ :handle_parse_errors => true)
897
926
 
898
927
  $stdin.each do |line|
899
928
  query = parser.parse(line)
@@ -90,12 +90,17 @@ module Ferret::Search
90
90
  filter = options[:filter]
91
91
  first_doc = options[:first_doc]||0
92
92
  num_docs = options[:num_docs]||10
93
+ max_size = first_doc + num_docs
93
94
  sort = options[:sort]
94
95
 
95
- if (num_docs <= 0) # nil might be returned from hq.top() below.
96
+ if (num_docs <= 0)
96
97
  raise ArgumentError, "num_docs must be > 0 to run a search"
97
98
  end
98
99
 
100
+ if (first_doc < 0)
101
+ raise ArgumentError, "first_doc must be >= 0 to run a search"
102
+ end
103
+
99
104
  scorer = query.weight(self).scorer(@reader)
100
105
  if (scorer == nil)
101
106
  return TopDocs.new(0, [])
@@ -104,33 +109,32 @@ module Ferret::Search
104
109
  bits = (filter.nil? ? nil : filter.bits(@reader))
105
110
  if (sort)
106
111
  fields = sort.is_a?(Array) ? sort : sort.fields
107
- hq = FieldSortedHitQueue.new(@reader, fields, num_docs + first_doc)
112
+ hq = FieldSortedHitQueue.new(@reader, fields, max_size)
108
113
  else
109
- hq = HitQueue.new(num_docs + first_doc)
114
+ hq = HitQueue.new(max_size)
110
115
  end
111
116
  total_hits = 0
112
117
  min_score = 0.0
113
118
  scorer.each_hit() do |doc, score|
114
119
  if score > 0.0 and (bits.nil? or bits.get(doc)) # skip docs not in bits
115
120
  total_hits += 1
116
- if hq.size < num_docs or score >= min_score
121
+ if hq.size < max_size or score >= min_score
117
122
  hq.insert(ScoreDoc.new(doc, score))
118
123
  min_score = hq.top.score # maintain min_score
119
124
  end
120
125
  end
121
126
  end
122
127
 
123
- score_docs = Array.new(hq.size)
128
+ score_docs = []
124
129
  if (hq.size > first_doc)
125
- score_docs = Array.new(hq.size - first_doc)
126
- first_doc.times { hq.pop }
127
- (hq.size - 1).downto(0) do |i|
128
- score_docs[i] = hq.pop
130
+ if (hq.size - first_doc) < num_docs
131
+ num_docs = hq.size - first_doc
132
+ end
133
+ num_docs.times do
134
+ score_docs.unshift(hq.pop)
129
135
  end
130
- else
131
- score_docs = []
132
- hq.clear
133
136
  end
137
+ hq.clear
134
138
 
135
139
  return TopDocs.new(total_hits, score_docs)
136
140
  end
@@ -25,8 +25,6 @@ module Ferret::Search
25
25
  @weight = weight
26
26
  @term_docs = td
27
27
  @norms = norms
28
- #XXX
29
- @norms_size = @norms.size
30
28
  @weight_value = weight.value
31
29
 
32
30
  SCORE_CACHE_SIZE.times do |i|
@@ -37,13 +37,15 @@ class FieldTest < Test::Unit::TestCase
37
37
  assert_equal(false, f.store_offsets?)
38
38
  assert_equal(false, f.store_positions?)
39
39
  assert_equal(false, f.binary?)
40
+ assert_equal("stored/compressed,indexed,tokenized,<name:value>", f.to_s)
40
41
  end
41
42
 
42
43
  def test_set_store()
43
- f = Field.new("name", "value", Field::Store::COMPRESS, Field::Index::TOKENIZED)
44
+ f = Field.new("name", nil, Field::Store::COMPRESS, Field::Index::TOKENIZED)
44
45
  f.stored = Field::Store::NO
45
46
  assert_equal(false, f.stored?)
46
47
  assert_equal(false, f.compressed?)
48
+ assert_equal("indexed,tokenized,<name:>", f.to_s)
47
49
  end
48
50
 
49
51
  def test_set_index()
@@ -51,6 +53,7 @@ class FieldTest < Test::Unit::TestCase
51
53
  f.index = Field::Index::NO
52
54
  assert_equal(false, f.indexed?)
53
55
  assert_equal(false, f.tokenized?)
56
+ assert_equal("stored/compressed,<name:value>", f.to_s)
54
57
  end
55
58
 
56
59
  def test_set_term_vector()
@@ -59,6 +62,7 @@ class FieldTest < Test::Unit::TestCase
59
62
  assert_equal(true, f.store_term_vector?)
60
63
  assert_equal(true, f.store_offsets?)
61
64
  assert_equal(true, f.store_positions?)
65
+ assert_equal("stored/compressed,indexed,tokenized,store_term_vector,tv_offset,tv_position,<name:value>", f.to_s)
62
66
  end
63
67
 
64
68
  def test_new_binary_field()
@@ -76,5 +80,6 @@ class FieldTest < Test::Unit::TestCase
76
80
  assert_equal(false, f.store_offsets?)
77
81
  assert_equal(false, f.store_positions?)
78
82
  assert_equal(true, f.binary?)
83
+ assert_equal("stored/uncompressed,binary,<name:#{bin}>", f.to_s)
79
84
  end
80
85
  end
@@ -5,6 +5,7 @@ class QueryParserTest < Test::Unit::TestCase
5
5
  def test_strings()
6
6
  parser = Ferret::QueryParser.new("xxx", :fields => ["f1", "f2", "f3"])
7
7
  pairs = [
8
+ ['', ''],
8
9
  ['word', 'word'],
9
10
  ['field:word', 'field:word'],
10
11
  ['"word1 word2 word3"', '"word word word"'],
@@ -92,8 +93,8 @@ class QueryParserTest < Test::Unit::TestCase
92
93
  ['"onewordphrase"', 'onewordphrase']
93
94
  ]
94
95
 
95
- pairs.each do |pair|
96
- assert_equal(pair[1], parser.parse(pair[0]).to_s(parser.default_field))
96
+ pairs.each do |query_str, expected|
97
+ assert_equal(expected, parser.parse(query_str).to_s(parser.default_field))
97
98
  end
98
99
  end
99
100
 
@@ -105,8 +106,32 @@ class QueryParserTest < Test::Unit::TestCase
105
106
  ['key:(1234)', 'key:1234']
106
107
  ]
107
108
 
108
- pairs.each do |pair|
109
- assert_equal(pair[1], parser.parse(pair[0]).to_s(parser.default_field))
109
+ pairs.each do |query_str, expected|
110
+ assert_equal(expected, parser.parse(query_str).to_s(parser.default_field))
111
+ end
112
+ end
113
+
114
+ def do_test_query_parse_exception_raised(str)
115
+ parser = Ferret::QueryParser.new("xxx", :fields => ["f1", "f2", "f3"])
116
+ assert_raise(Ferret::QueryParser::QueryParseException) do
117
+ parser.parse(str)
118
+ end
119
+ end
120
+
121
+
122
+ def test_bad_queries
123
+ parser = Ferret::QueryParser.new("xxx", :fields => ["f1", "f2"],
124
+ :handle_parse_errors => true)
125
+
126
+ pairs = [
127
+ ['(*word', 'word'],
128
+ ['()*&)(*^&*(', ''],
129
+ ['()*&one)(*two(*&"', 'one two']
130
+ ]
131
+
132
+ pairs.each do |query_str, expected|
133
+ do_test_query_parse_exception_raised(query_str)
134
+ assert_equal(expected, parser.parse(query_str).to_s(parser.default_field))
110
135
  end
111
136
  end
112
137
  end
@@ -46,6 +46,15 @@ class IndexSearcherTest < Test::Unit::TestCase
46
46
  end
47
47
  end
48
48
 
49
+ def check_docs(query, options, expected=[])
50
+ top_docs = @is.search(query, options)
51
+ docs = top_docs.score_docs
52
+ assert_equal(expected.length, docs.length)
53
+ docs.length.times do |i|
54
+ assert_equal(expected[i], docs[i].doc)
55
+ end
56
+ end
57
+
49
58
  def test_get_doc()
50
59
  assert_equal(18, @is.max_doc)
51
60
  assert_equal("20050930", @is.doc(0).values(:date))
@@ -57,15 +66,38 @@ class IndexSearcherTest < Test::Unit::TestCase
57
66
  tq.boost = 100
58
67
  check_hits(tq, [1,4,8])
59
68
 
69
+ tq = TermQuery.new(Term.new("field", ""));
70
+ check_hits(tq, [])
71
+
60
72
  tq = TermQuery.new(Term.new("field", "word1"));
61
73
  top_docs = @is.search(tq)
62
- #puts top_docs.score_docs
63
74
  assert_equal(@documents.size, top_docs.total_hits)
64
75
  assert_equal(10, top_docs.score_docs.size)
65
76
  top_docs = @is.search(tq, {:num_docs => 20})
66
77
  assert_equal(@documents.size, top_docs.score_docs.size)
67
78
  end
68
79
 
80
+
81
+ def test_first_doc
82
+ tq = TermQuery.new(Term.new("field", "word1"));
83
+ tq.boost = 100
84
+ top_docs = @is.search(tq, {:num_docs => 100})
85
+ expected = []
86
+ top_docs.score_docs.each do |score_doc|
87
+ expected << score_doc.doc
88
+ end
89
+
90
+ assert_raise(ArgumentError) { @is.search(tq, {:first_doc => -1}) }
91
+ assert_raise(ArgumentError) { @is.search(tq, {:num_docs => 0}) }
92
+ assert_raise(ArgumentError) { @is.search(tq, {:num_docs => -1}) }
93
+
94
+ check_docs(tq, {:num_docs => 8, :first_doc => 0}, expected[0,8])
95
+ check_docs(tq, {:num_docs => 3, :first_doc => 1}, expected[1,3])
96
+ check_docs(tq, {:num_docs => 6, :first_doc => 2}, expected[2,6])
97
+ check_docs(tq, {:num_docs => 2, :first_doc => expected.length}, [])
98
+ check_docs(tq, {:num_docs => 2, :first_doc => expected.length + 100}, [])
99
+ end
100
+
69
101
  def test_boolean_query
70
102
  bq = BooleanQuery.new()
71
103
  tq1 = TermQuery.new(Term.new("field", "word1"))
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.8.11
3
3
  specification_version: 1
4
4
  name: ferret
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.2.1
7
- date: 2005-11-14 00:00:00 +09:00
6
+ version: 0.2.2
7
+ date: 2005-11-22 00:00:00 +09:00
8
8
  summary: Ruby indexing library.
9
9
  require_paths:
10
10
  - lib