ferret 0.2.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/ferret.rb +1 -1
- data/lib/ferret/analysis/analyzers.rb +5 -4
- data/lib/ferret/document/field.rb +8 -21
- data/lib/ferret/index/compound_file_io.rb +14 -22
- data/lib/ferret/index/index.rb +20 -1
- data/lib/ferret/index/index_reader.rb +1 -1
- data/lib/ferret/index/index_writer.rb +6 -6
- data/lib/ferret/index/segment_reader.rb +13 -12
- data/lib/ferret/query_parser.rb +43 -13
- data/lib/ferret/query_parser/query_parser.tab.rb +42 -13
- data/lib/ferret/search/index_searcher.rb +16 -12
- data/lib/ferret/search/term_scorer.rb +0 -2
- data/test/unit/document/tc_field.rb +6 -1
- data/test/unit/query_parser/tc_query_parser.rb +29 -4
- data/test/unit/search/tc_index_searcher.rb +33 -1
- metadata +2 -2
data/lib/ferret.rb
CHANGED
@@ -32,9 +32,8 @@ module Ferret::Analysis
|
|
32
32
|
# An array containing some common English words that are not usually useful
|
33
33
|
# for searching.
|
34
34
|
ENGLISH_STOP_WORDS = [
|
35
|
-
"a", "an", "and", "are", "as", "at", "be", "but", "by",
|
36
|
-
"
|
37
|
-
"no", "not", "of", "on", "or", "s", "such",
|
35
|
+
"a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if",
|
36
|
+
"in", "into", "is", "it", "no", "not", "of", "on", "or", "s", "such",
|
38
37
|
"t", "that", "the", "their", "then", "there", "these",
|
39
38
|
"they", "this", "to", "was", "will", "with"
|
40
39
|
]
|
@@ -51,6 +50,8 @@ module Ferret::Analysis
|
|
51
50
|
end
|
52
51
|
|
53
52
|
# An Analyzer that filters StandardTokenizer with LowerCaseFilter and StopFilter.
|
53
|
+
# This analyzer subclasses the StopAnalyzer so you can add your own
|
54
|
+
# stoplist the same way. See StopAnalyzer.
|
54
55
|
class StandardAnalyzer < StopAnalyzer
|
55
56
|
def token_stream(field, string)
|
56
57
|
return StopFilter.new(LowerCaseFilter.new(StandardTokenizer.new(string)), @stop_words)
|
@@ -84,7 +85,7 @@ module Ferret::Analysis
|
|
84
85
|
def token_stream(field, string)
|
85
86
|
analyzer = @analyzers[field]
|
86
87
|
if (analyzer == nil)
|
87
|
-
analyzer = @default_analyzer
|
88
|
+
analyzer = @default_analyzer
|
88
89
|
end
|
89
90
|
|
90
91
|
return analyzer.token_stream(field, string)
|
@@ -277,28 +277,15 @@ module Ferret::Document
|
|
277
277
|
str = ""
|
278
278
|
if (@stored)
|
279
279
|
str << "stored"
|
280
|
-
|
280
|
+
str << (@compressed ? "/compressed," : "/uncompressed,")
|
281
281
|
end
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
if (@store_offset)
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
str << "term_vector_position,"
|
290
|
-
end
|
291
|
-
if (@binary) then str << "binary," end
|
292
|
-
|
293
|
-
str << '<'
|
294
|
-
str << @name
|
295
|
-
str << ':'
|
296
|
-
|
297
|
-
if (@data != null)
|
298
|
-
str << @data.to_s
|
299
|
-
end
|
300
|
-
|
301
|
-
str << '>'
|
282
|
+
str << "indexed," if (@indexed)
|
283
|
+
str << "tokenized," if (@tokenized)
|
284
|
+
str << "store_term_vector," if (@store_term_vector)
|
285
|
+
str << "tv_offset," if (@store_offset)
|
286
|
+
str << "tv_position," if (@store_position)
|
287
|
+
str << "binary," if (@binary)
|
288
|
+
str << "<#{@name}:#{data}>"
|
302
289
|
end
|
303
290
|
end
|
304
291
|
end
|
@@ -107,10 +107,10 @@ module Ferret::Index
|
|
107
107
|
end
|
108
108
|
|
109
109
|
# Not implemented
|
110
|
-
def
|
110
|
+
def remove(name) raise(NotImplementedError) end
|
111
111
|
|
112
112
|
# Not implemented
|
113
|
-
def rename(from, to) raise(
|
113
|
+
def rename(from, to) raise(NotImplementedError) end
|
114
114
|
|
115
115
|
# Returns the length of a file in the directory.
|
116
116
|
def length(name)
|
@@ -120,10 +120,10 @@ module Ferret::Index
|
|
120
120
|
end
|
121
121
|
|
122
122
|
# Not implemented
|
123
|
-
def create_output(name) raise(
|
123
|
+
def create_output(name) raise(NotImplementedError) end
|
124
124
|
|
125
125
|
# Not implemented
|
126
|
-
def make_lock(name) raise(
|
126
|
+
def make_lock(name) raise(NotImplementedError) end
|
127
127
|
|
128
128
|
# Implementation of an IndexInput that reads from a portion of the
|
129
129
|
# compound file.
|
@@ -206,8 +206,8 @@ module Ferret::Index
|
|
206
206
|
# Add a source stream. _file_name_ is the string by which the
|
207
207
|
# sub-stream will be known in the compound stream.
|
208
208
|
#
|
209
|
-
#
|
210
|
-
#
|
209
|
+
# Raises:: StateError if this writer is closed
|
210
|
+
# Raises:: ArgumentError if a file with the same name
|
211
211
|
# has been added already
|
212
212
|
def add_file(file_name)
|
213
213
|
if @merged
|
@@ -253,7 +253,7 @@ module Ferret::Index
|
|
253
253
|
# Remember the positions of directory entries so that we can
|
254
254
|
# adjust the offsets later
|
255
255
|
@file_entries.each do |fe|
|
256
|
-
fe.
|
256
|
+
fe.dir_offset = os.pos()
|
257
257
|
os.write_long(0) # for now
|
258
258
|
os.write_string(fe.file_name)
|
259
259
|
end
|
@@ -267,7 +267,7 @@ module Ferret::Index
|
|
267
267
|
|
268
268
|
# Write the data offsets into the directory of the compound stream
|
269
269
|
@file_entries.each do |fe|
|
270
|
-
os.seek(fe.
|
270
|
+
os.seek(fe.dir_offset)
|
271
271
|
os.write_long(fe.data_offset)
|
272
272
|
end
|
273
273
|
|
@@ -292,15 +292,7 @@ module Ferret::Index
|
|
292
292
|
private
|
293
293
|
|
294
294
|
# Internal class for holding a file
|
295
|
-
|
296
|
-
|
297
|
-
attr_accessor :file_name, :directory_offset, :data_offset
|
298
|
-
|
299
|
-
def initialize(file_name)
|
300
|
-
@file_name = file_name
|
301
|
-
end
|
302
|
-
|
303
|
-
end
|
295
|
+
FileEntry = Struct.new(:file_name, :dir_offset, :data_offset)
|
304
296
|
|
305
297
|
# Copy the contents of the file with specified extension into the
|
306
298
|
# provided output stream. Use a buffer for moving data
|
@@ -324,9 +316,9 @@ module Ferret::Index
|
|
324
316
|
# Verify that remainder is 0
|
325
317
|
if (remainder != 0)
|
326
318
|
raise(IOError,
|
327
|
-
"Non-zero remainder length after copying: " +
|
328
|
-
|
329
|
-
|
319
|
+
"Non-zero remainder length after copying: #{remainder} " +
|
320
|
+
"(id: #{source.file_name}, length: #{length}, buffer size: " +
|
321
|
+
" #{Ferret::Store::BUFFER_SIZE})")
|
330
322
|
end
|
331
323
|
|
332
324
|
# Verify that the output length diff is equal to original file
|
@@ -334,8 +326,8 @@ module Ferret::Index
|
|
334
326
|
diff = end_ptr - start_ptr
|
335
327
|
if (diff != length)
|
336
328
|
raise(IOError,
|
337
|
-
"Difference in the output file offsets " +
|
338
|
-
|
329
|
+
"Difference in the output file offsets #{diff}" +
|
330
|
+
" does not match the original file length #{length}")
|
339
331
|
end
|
340
332
|
|
341
333
|
ensure
|
data/lib/ferret/index/index.rb
CHANGED
@@ -76,6 +76,23 @@ module Ferret::Index
|
|
76
76
|
# be replaced by the new object. This will slow
|
77
77
|
# down indexing so it should not be used if
|
78
78
|
# performance is a concern.
|
79
|
+
# use_compound_file:: Uses a compound file to store the index. This
|
80
|
+
# prevents an error being raised for having too
|
81
|
+
# many files open at the same time. The default is
|
82
|
+
# true but performance is better if this is set to
|
83
|
+
# false.
|
84
|
+
# handle_parse_errors:: Set this to true if you want the QueryParser to
|
85
|
+
# degrade gracefully on errors. If the query parser
|
86
|
+
# fails to parse this query, it will try to parse
|
87
|
+
# it as a straight boolean query on the default
|
88
|
+
# field ignoring all query punctuation. If this
|
89
|
+
# fails, it will return an empty TermQuery. If you
|
90
|
+
# use this and you need to know why your query
|
91
|
+
# isn't working you can use the Query#to_s method
|
92
|
+
# on the query returned to see what is happening to
|
93
|
+
# your query. This defaults to true. If you set it
|
94
|
+
# to false a QueryParseException is raised on a
|
95
|
+
# query parse error.
|
79
96
|
#
|
80
97
|
# Some examples;
|
81
98
|
#
|
@@ -86,7 +103,8 @@ module Ferret::Index
|
|
86
103
|
#
|
87
104
|
# index = Index::Index.new(:dir => directory,
|
88
105
|
# :close_dir => false,
|
89
|
-
# :default_slop => 2
|
106
|
+
# :default_slop => 2,
|
107
|
+
# :handle_parse_errors => false)
|
90
108
|
#
|
91
109
|
def initialize(options = {})
|
92
110
|
super()
|
@@ -117,6 +135,7 @@ module Ferret::Index
|
|
117
135
|
@default_search_field = (@options[:default_search_field] || \
|
118
136
|
@options[:default_field] || "*")
|
119
137
|
@default_field = @options[:default_field] || ""
|
138
|
+
@options[:handle_parse_errors] = true if @options[:handle_parse_errors].nil?
|
120
139
|
@open = true
|
121
140
|
@qp = nil
|
122
141
|
end
|
@@ -100,7 +100,7 @@ module Ferret::Index
|
|
100
100
|
if directory.nil?
|
101
101
|
directory = Ferret::Store::RAMDirectory.new
|
102
102
|
elsif directory.is_a?(String)
|
103
|
-
directory = Ferret::Store::FSDirectory.new(directory,
|
103
|
+
directory = Ferret::Store::FSDirectory.new(directory, false)
|
104
104
|
end
|
105
105
|
directory.synchronize do # in- & inter-process sync
|
106
106
|
commit_lock = directory.make_lock(IndexWriter::COMMIT_LOCK_NAME)
|
@@ -83,21 +83,21 @@ module Index
|
|
83
83
|
@close_dir = options[:close_dir] || false
|
84
84
|
@use_compound_file = (options[:use_compound_file] != false) # ie default true
|
85
85
|
@analyzer = options[:analyzer] || Ferret::Analysis::StandardAnalyzer.new
|
86
|
-
@merge_factor = DEFAULT_MERGE_FACTOR
|
87
|
-
@min_merge_docs = DEFAULT_MIN_MERGE_DOCS
|
88
|
-
@max_merge_docs = DEFAULT_MAX_MERGE_DOCS
|
89
|
-
@max_field_length = DEFAULT_MAX_FIELD_LENGTH
|
90
|
-
@term_index_interval = DEFAULT_TERM_INDEX_INTERVAL
|
86
|
+
@merge_factor = options[:merge_factor] || DEFAULT_MERGE_FACTOR
|
87
|
+
@min_merge_docs = options[:min_merge_docs] || DEFAULT_MIN_MERGE_DOCS
|
88
|
+
@max_merge_docs = options[:max_merge_docs] || DEFAULT_MAX_MERGE_DOCS
|
89
|
+
@max_field_length = options[:max_field_length] || DEFAULT_MAX_FIELD_LENGTH
|
90
|
+
@term_index_interval = options[:term_index_interval] || DEFAULT_TERM_INDEX_INTERVAL
|
91
91
|
|
92
92
|
@similarity = Search::Similarity.default
|
93
93
|
@segment_infos = SegmentInfos.new()
|
94
94
|
@ram_directory = Ferret::Store::RAMDirectory.new()
|
95
95
|
|
96
96
|
# Make sure that the lock is released when this object is destroyed
|
97
|
-
define_finalizer(self, proc { |id| @write_lock.release() if @write_lock})
|
98
97
|
|
99
98
|
@write_lock = @directory.make_lock(WRITE_LOCK_NAME)
|
100
99
|
@write_lock.obtain(WRITE_LOCK_TIMEOUT) # obtain write lock
|
100
|
+
define_finalizer(@write_lock, proc { |id| @write_lock.release() if @write_lock})
|
101
101
|
|
102
102
|
@directory.synchronize() do # in- & inter-process sync
|
103
103
|
@directory.make_lock(COMMIT_LOCK_NAME).while_locked(COMMIT_LOCK_TIMEOUT) do
|
@@ -16,16 +16,17 @@ module Ferret::Index
|
|
16
16
|
@segment = info.name
|
17
17
|
|
18
18
|
@cfs_reader = nil
|
19
|
-
|
20
|
-
if directory.exists?(@segment + '.cfs') then
|
19
|
+
dir = directory
|
20
|
+
#if directory.exists?(@segment + '.cfs') then
|
21
|
+
if SegmentReader.uses_compound_file?(info)
|
21
22
|
@cfs_reader = CompoundFileReader.new(directory, @segment + '.cfs')
|
22
|
-
|
23
|
+
dir = @cfs_reader
|
23
24
|
end
|
24
25
|
|
25
|
-
@field_infos = FieldInfos.new(
|
26
|
-
@fields_reader = FieldsReader.new(
|
26
|
+
@field_infos = FieldInfos.new(dir, @segment + '.fnm')
|
27
|
+
@fields_reader = FieldsReader.new(dir, @segment, @field_infos)
|
27
28
|
|
28
|
-
@term_infos = TermInfosReader.new(
|
29
|
+
@term_infos = TermInfosReader.new(dir, @segment, @field_infos)
|
29
30
|
@deleted_docs = nil
|
30
31
|
@deleted_docs_dirty = false
|
31
32
|
if SegmentReader.has_deletions?(info) then
|
@@ -33,16 +34,16 @@ module Ferret::Index
|
|
33
34
|
Ferret::Utils::BitVector.read(directory, @segment + '.del')
|
34
35
|
end
|
35
36
|
|
36
|
-
@freq_stream =
|
37
|
-
@prox_stream =
|
37
|
+
@freq_stream = dir.open_input(@segment + '.frq')
|
38
|
+
@prox_stream = dir.open_input(@segment + '.prx')
|
38
39
|
@norms = {}
|
39
40
|
@norms.extend(MonitorMixin)
|
40
41
|
@norms_dirty = false
|
41
|
-
open_norms(
|
42
|
+
open_norms(dir)
|
42
43
|
|
43
44
|
@tv_reader_orig = nil
|
44
45
|
if @field_infos.has_vectors? then
|
45
|
-
@tv_reader_orig = TermVectorsReader.new(
|
46
|
+
@tv_reader_orig = TermVectorsReader.new(dir, @segment, @field_infos)
|
46
47
|
end
|
47
48
|
end
|
48
49
|
|
@@ -128,9 +129,9 @@ module Ferret::Index
|
|
128
129
|
@field_infos.each_with_index do |fi, i|
|
129
130
|
if (fi.indexed?)
|
130
131
|
if @cfs_reader.nil?
|
131
|
-
name = @segment
|
132
|
+
name = "#{@segment}.f#{i}"
|
132
133
|
else
|
133
|
-
name = @segment
|
134
|
+
name = "#{@segment}.s#{i}"
|
134
135
|
end
|
135
136
|
if (@directory.exists?(name))
|
136
137
|
file_names << name
|
data/lib/ferret/query_parser.rb
CHANGED
@@ -242,17 +242,29 @@ module Ferret
|
|
242
242
|
#
|
243
243
|
# === Options
|
244
244
|
#
|
245
|
-
# analyzer::
|
246
|
-
#
|
247
|
-
#
|
248
|
-
# occur_default::
|
249
|
-
#
|
250
|
-
#
|
251
|
-
# wild_lower::
|
252
|
-
#
|
253
|
-
#
|
254
|
-
#
|
255
|
-
#
|
245
|
+
# analyzer:: The analyzer is used to break phrases up into
|
246
|
+
# terms and to turn terms into tokens recognized in
|
247
|
+
# the index. Analysis::Analyzer is the default
|
248
|
+
# occur_default:: Set to either BooleanClause::Occur::SHOULD
|
249
|
+
# (default) or BooleanClause::Occur::MUST to specify
|
250
|
+
# the default Occur operator.
|
251
|
+
# wild_lower:: Set to false if you don't want the terms in fuzzy
|
252
|
+
# and wild queries to be set to lower case. You
|
253
|
+
# should do this if your analyzer doesn't downcase.
|
254
|
+
# The default is true.
|
255
|
+
# default_slop:: Set the default slop for phrase queries. This
|
256
|
+
# defaults to 0.
|
257
|
+
# handle_parse_errors:: Set this to true if you want the QueryParser to
|
258
|
+
# degrade gracefully on errors. If the query parser
|
259
|
+
# fails to parse this query, it will try to parse it
|
260
|
+
# as a straight boolean query on the default field
|
261
|
+
# ignoring all query punctuation. If this fails, it
|
262
|
+
# will return an empty TermQuery. If you use this
|
263
|
+
# and you need to know why your query isn't working
|
264
|
+
# you can use the Query#to_s method on the query
|
265
|
+
# returned to see what is happening to your query.
|
266
|
+
# This defaults to false, in which case a
|
267
|
+
# QueryParseException is thrown.
|
256
268
|
def initialize(default_field = "", options = {})
|
257
269
|
end
|
258
270
|
|
@@ -263,10 +275,10 @@ module Ferret
|
|
263
275
|
|
264
276
|
# Set to false if you don't want the terms in fuzzy and wild queries to be
|
265
277
|
# set to lower case. You should do this if your analyzer doesn't downcase.
|
266
|
-
def wild_lower()
|
278
|
+
def wild_lower=()
|
267
279
|
end
|
268
280
|
|
269
|
-
# Returns the value of wild_lower. See #wild_lower
|
281
|
+
# Returns the value of wild_lower. See #wild_lower=.
|
270
282
|
def wild_lower?()
|
271
283
|
end
|
272
284
|
|
@@ -276,7 +288,25 @@ module Ferret
|
|
276
288
|
# if you'd like to do your own query string cleaning.
|
277
289
|
def clean_string(str)
|
278
290
|
end
|
291
|
+
|
292
|
+
# The exception thrown when there is an error parsing the query string.
|
293
|
+
# This also holds the Racc::ParseError that was thrown in case you want to
|
294
|
+
# investigate why a query won't parse.
|
295
|
+
class QueryParseException < Exception
|
296
|
+
attr_reader :parse_error
|
297
|
+
|
298
|
+
# Create a new QueryParseException
|
299
|
+
#
|
300
|
+
# error:: An error string describing the query that failed
|
301
|
+
# parse_error:: The actual parse error that was thrown by Racc. It is a
|
302
|
+
# Racc::ParseError object.
|
303
|
+
def initialize(error, parse_error)
|
304
|
+
super(error)
|
305
|
+
@parse_error = parse_error
|
306
|
+
end
|
307
|
+
end
|
279
308
|
end
|
309
|
+
|
280
310
|
end
|
281
311
|
|
282
312
|
require 'ferret/query_parser/query_parser.tab.rb'
|
@@ -11,15 +11,8 @@ module Ferret
|
|
11
11
|
|
12
12
|
class QueryParser < Racc::Parser
|
13
13
|
|
14
|
-
module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..
|
15
|
-
attr_accessor :default_field, :fields
|
16
|
-
|
17
|
-
# true if you want to downcase wild card queries. This is set to try by
|
18
|
-
# default.
|
19
|
-
attr_writer :wild_lower
|
20
|
-
|
21
|
-
def wild_lower?() @wild_lower end
|
22
|
-
|
14
|
+
module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id81dbd43492', 'lib/ferret/query_parser/query_parser.y', 126
|
15
|
+
attr_accessor :default_field, :fields, :handle_parse_errors
|
23
16
|
|
24
17
|
def initialize(default_field = "*", options = {})
|
25
18
|
@yydebug = true
|
@@ -32,6 +25,7 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id9e08d4407
|
|
32
25
|
@occur_default = options[:occur_default] || BooleanClause::Occur::SHOULD
|
33
26
|
@default_slop = options[:default_slop] || 0
|
34
27
|
@fields = options[:fields]||[]
|
28
|
+
@handle_parse_errors = options[:handle_parse_errors] || false
|
35
29
|
end
|
36
30
|
|
37
31
|
RESERVED = {
|
@@ -50,6 +44,7 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id9e08d4407
|
|
50
44
|
EWCHR = %q,:()\[\]{}!+"~^\-\|<>\=,
|
51
45
|
|
52
46
|
def parse(str)
|
47
|
+
orig_str = str
|
53
48
|
str = clean_string(str)
|
54
49
|
str.strip!
|
55
50
|
@q = []
|
@@ -82,10 +77,24 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id9e08d4407
|
|
82
77
|
end
|
83
78
|
str = $'
|
84
79
|
end
|
85
|
-
@q.
|
80
|
+
if @q.empty?
|
81
|
+
return TermQuery.new(Term.new(@default_field, ""))
|
82
|
+
end
|
83
|
+
|
84
|
+
@q.push([ false, '$' ])
|
86
85
|
#p @q
|
87
86
|
|
88
|
-
|
87
|
+
begin
|
88
|
+
query = do_parse
|
89
|
+
rescue Racc::ParseError => e
|
90
|
+
if @handle_parse_errors
|
91
|
+
@field = @default_field
|
92
|
+
query = _get_bad_query(orig_str)
|
93
|
+
else
|
94
|
+
raise QueryParseException.new("Could not parse #{str}", e)
|
95
|
+
end
|
96
|
+
end
|
97
|
+
return query
|
89
98
|
end
|
90
99
|
|
91
100
|
def next_token
|
@@ -160,6 +169,25 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id9e08d4407
|
|
160
169
|
return new_str.pack("c*")
|
161
170
|
end
|
162
171
|
|
172
|
+
def get_bad_query(field, str)
|
173
|
+
tokens = []
|
174
|
+
stream = @analyzer.token_stream(field, str)
|
175
|
+
while token = stream.next
|
176
|
+
tokens << token
|
177
|
+
end
|
178
|
+
if tokens.length == 0
|
179
|
+
return TermQuery.new(Term.new(field, ""))
|
180
|
+
elsif tokens.length == 1
|
181
|
+
return TermQuery.new(Term.new(field, tokens[0].term_text))
|
182
|
+
else
|
183
|
+
bq = BooleanQuery.new()
|
184
|
+
tokens.each do |token|
|
185
|
+
bq << BooleanClause.new(TermQuery.new(Term.new(field, token.term_text)))
|
186
|
+
end
|
187
|
+
return bq
|
188
|
+
end
|
189
|
+
end
|
190
|
+
|
163
191
|
def get_range_query(field, start_word, end_word, inc_upper, inc_lower)
|
164
192
|
RangeQuery.new(field, start_word, end_word, inc_upper, inc_lower)
|
165
193
|
end
|
@@ -374,7 +402,7 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id9e08d4407
|
|
374
402
|
return qp.parse(query)
|
375
403
|
end
|
376
404
|
|
377
|
-
..end lib/ferret/query_parser/query_parser.y modeval..
|
405
|
+
..end lib/ferret/query_parser/query_parser.y modeval..id81dbd43492
|
378
406
|
|
379
407
|
##### racc 1.4.4 generates ###
|
380
408
|
|
@@ -893,7 +921,8 @@ if __FILE__ == $0
|
|
893
921
|
|
894
922
|
parser = Ferret::QueryParser.new("default",
|
895
923
|
:fields => ["f1", "f2", "f3"],
|
896
|
-
:analyzer => Ferret::Analysis::StandardAnalyzer.new
|
924
|
+
:analyzer => Ferret::Analysis::StandardAnalyzer.new,
|
925
|
+
:handle_parse_errors => true)
|
897
926
|
|
898
927
|
$stdin.each do |line|
|
899
928
|
query = parser.parse(line)
|
@@ -90,12 +90,17 @@ module Ferret::Search
|
|
90
90
|
filter = options[:filter]
|
91
91
|
first_doc = options[:first_doc]||0
|
92
92
|
num_docs = options[:num_docs]||10
|
93
|
+
max_size = first_doc + num_docs
|
93
94
|
sort = options[:sort]
|
94
95
|
|
95
|
-
if (num_docs <= 0)
|
96
|
+
if (num_docs <= 0)
|
96
97
|
raise ArgumentError, "num_docs must be > 0 to run a search"
|
97
98
|
end
|
98
99
|
|
100
|
+
if (first_doc < 0)
|
101
|
+
raise ArgumentError, "first_doc must be >= 0 to run a search"
|
102
|
+
end
|
103
|
+
|
99
104
|
scorer = query.weight(self).scorer(@reader)
|
100
105
|
if (scorer == nil)
|
101
106
|
return TopDocs.new(0, [])
|
@@ -104,33 +109,32 @@ module Ferret::Search
|
|
104
109
|
bits = (filter.nil? ? nil : filter.bits(@reader))
|
105
110
|
if (sort)
|
106
111
|
fields = sort.is_a?(Array) ? sort : sort.fields
|
107
|
-
hq = FieldSortedHitQueue.new(@reader, fields,
|
112
|
+
hq = FieldSortedHitQueue.new(@reader, fields, max_size)
|
108
113
|
else
|
109
|
-
hq = HitQueue.new(
|
114
|
+
hq = HitQueue.new(max_size)
|
110
115
|
end
|
111
116
|
total_hits = 0
|
112
117
|
min_score = 0.0
|
113
118
|
scorer.each_hit() do |doc, score|
|
114
119
|
if score > 0.0 and (bits.nil? or bits.get(doc)) # skip docs not in bits
|
115
120
|
total_hits += 1
|
116
|
-
if hq.size <
|
121
|
+
if hq.size < max_size or score >= min_score
|
117
122
|
hq.insert(ScoreDoc.new(doc, score))
|
118
123
|
min_score = hq.top.score # maintain min_score
|
119
124
|
end
|
120
125
|
end
|
121
126
|
end
|
122
127
|
|
123
|
-
score_docs =
|
128
|
+
score_docs = []
|
124
129
|
if (hq.size > first_doc)
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
130
|
+
if (hq.size - first_doc) < num_docs
|
131
|
+
num_docs = hq.size - first_doc
|
132
|
+
end
|
133
|
+
num_docs.times do
|
134
|
+
score_docs.unshift(hq.pop)
|
129
135
|
end
|
130
|
-
else
|
131
|
-
score_docs = []
|
132
|
-
hq.clear
|
133
136
|
end
|
137
|
+
hq.clear
|
134
138
|
|
135
139
|
return TopDocs.new(total_hits, score_docs)
|
136
140
|
end
|
@@ -37,13 +37,15 @@ class FieldTest < Test::Unit::TestCase
|
|
37
37
|
assert_equal(false, f.store_offsets?)
|
38
38
|
assert_equal(false, f.store_positions?)
|
39
39
|
assert_equal(false, f.binary?)
|
40
|
+
assert_equal("stored/compressed,indexed,tokenized,<name:value>", f.to_s)
|
40
41
|
end
|
41
42
|
|
42
43
|
def test_set_store()
|
43
|
-
f = Field.new("name",
|
44
|
+
f = Field.new("name", nil, Field::Store::COMPRESS, Field::Index::TOKENIZED)
|
44
45
|
f.stored = Field::Store::NO
|
45
46
|
assert_equal(false, f.stored?)
|
46
47
|
assert_equal(false, f.compressed?)
|
48
|
+
assert_equal("indexed,tokenized,<name:>", f.to_s)
|
47
49
|
end
|
48
50
|
|
49
51
|
def test_set_index()
|
@@ -51,6 +53,7 @@ class FieldTest < Test::Unit::TestCase
|
|
51
53
|
f.index = Field::Index::NO
|
52
54
|
assert_equal(false, f.indexed?)
|
53
55
|
assert_equal(false, f.tokenized?)
|
56
|
+
assert_equal("stored/compressed,<name:value>", f.to_s)
|
54
57
|
end
|
55
58
|
|
56
59
|
def test_set_term_vector()
|
@@ -59,6 +62,7 @@ class FieldTest < Test::Unit::TestCase
|
|
59
62
|
assert_equal(true, f.store_term_vector?)
|
60
63
|
assert_equal(true, f.store_offsets?)
|
61
64
|
assert_equal(true, f.store_positions?)
|
65
|
+
assert_equal("stored/compressed,indexed,tokenized,store_term_vector,tv_offset,tv_position,<name:value>", f.to_s)
|
62
66
|
end
|
63
67
|
|
64
68
|
def test_new_binary_field()
|
@@ -76,5 +80,6 @@ class FieldTest < Test::Unit::TestCase
|
|
76
80
|
assert_equal(false, f.store_offsets?)
|
77
81
|
assert_equal(false, f.store_positions?)
|
78
82
|
assert_equal(true, f.binary?)
|
83
|
+
assert_equal("stored/uncompressed,binary,<name:#{bin}>", f.to_s)
|
79
84
|
end
|
80
85
|
end
|
@@ -5,6 +5,7 @@ class QueryParserTest < Test::Unit::TestCase
|
|
5
5
|
def test_strings()
|
6
6
|
parser = Ferret::QueryParser.new("xxx", :fields => ["f1", "f2", "f3"])
|
7
7
|
pairs = [
|
8
|
+
['', ''],
|
8
9
|
['word', 'word'],
|
9
10
|
['field:word', 'field:word'],
|
10
11
|
['"word1 word2 word3"', '"word word word"'],
|
@@ -92,8 +93,8 @@ class QueryParserTest < Test::Unit::TestCase
|
|
92
93
|
['"onewordphrase"', 'onewordphrase']
|
93
94
|
]
|
94
95
|
|
95
|
-
pairs.each do |
|
96
|
-
assert_equal(
|
96
|
+
pairs.each do |query_str, expected|
|
97
|
+
assert_equal(expected, parser.parse(query_str).to_s(parser.default_field))
|
97
98
|
end
|
98
99
|
end
|
99
100
|
|
@@ -105,8 +106,32 @@ class QueryParserTest < Test::Unit::TestCase
|
|
105
106
|
['key:(1234)', 'key:1234']
|
106
107
|
]
|
107
108
|
|
108
|
-
pairs.each do |
|
109
|
-
assert_equal(
|
109
|
+
pairs.each do |query_str, expected|
|
110
|
+
assert_equal(expected, parser.parse(query_str).to_s(parser.default_field))
|
111
|
+
end
|
112
|
+
end
|
113
|
+
|
114
|
+
def do_test_query_parse_exception_raised(str)
|
115
|
+
parser = Ferret::QueryParser.new("xxx", :fields => ["f1", "f2", "f3"])
|
116
|
+
assert_raise(Ferret::QueryParser::QueryParseException) do
|
117
|
+
parser.parse(str)
|
118
|
+
end
|
119
|
+
end
|
120
|
+
|
121
|
+
|
122
|
+
def test_bad_queries
|
123
|
+
parser = Ferret::QueryParser.new("xxx", :fields => ["f1", "f2"],
|
124
|
+
:handle_parse_errors => true)
|
125
|
+
|
126
|
+
pairs = [
|
127
|
+
['(*word', 'word'],
|
128
|
+
['()*&)(*^&*(', ''],
|
129
|
+
['()*&one)(*two(*&"', 'one two']
|
130
|
+
]
|
131
|
+
|
132
|
+
pairs.each do |query_str, expected|
|
133
|
+
do_test_query_parse_exception_raised(query_str)
|
134
|
+
assert_equal(expected, parser.parse(query_str).to_s(parser.default_field))
|
110
135
|
end
|
111
136
|
end
|
112
137
|
end
|
@@ -46,6 +46,15 @@ class IndexSearcherTest < Test::Unit::TestCase
|
|
46
46
|
end
|
47
47
|
end
|
48
48
|
|
49
|
+
def check_docs(query, options, expected=[])
|
50
|
+
top_docs = @is.search(query, options)
|
51
|
+
docs = top_docs.score_docs
|
52
|
+
assert_equal(expected.length, docs.length)
|
53
|
+
docs.length.times do |i|
|
54
|
+
assert_equal(expected[i], docs[i].doc)
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
49
58
|
def test_get_doc()
|
50
59
|
assert_equal(18, @is.max_doc)
|
51
60
|
assert_equal("20050930", @is.doc(0).values(:date))
|
@@ -57,15 +66,38 @@ class IndexSearcherTest < Test::Unit::TestCase
|
|
57
66
|
tq.boost = 100
|
58
67
|
check_hits(tq, [1,4,8])
|
59
68
|
|
69
|
+
tq = TermQuery.new(Term.new("field", ""));
|
70
|
+
check_hits(tq, [])
|
71
|
+
|
60
72
|
tq = TermQuery.new(Term.new("field", "word1"));
|
61
73
|
top_docs = @is.search(tq)
|
62
|
-
#puts top_docs.score_docs
|
63
74
|
assert_equal(@documents.size, top_docs.total_hits)
|
64
75
|
assert_equal(10, top_docs.score_docs.size)
|
65
76
|
top_docs = @is.search(tq, {:num_docs => 20})
|
66
77
|
assert_equal(@documents.size, top_docs.score_docs.size)
|
67
78
|
end
|
68
79
|
|
80
|
+
|
81
|
+
def test_first_doc
|
82
|
+
tq = TermQuery.new(Term.new("field", "word1"));
|
83
|
+
tq.boost = 100
|
84
|
+
top_docs = @is.search(tq, {:num_docs => 100})
|
85
|
+
expected = []
|
86
|
+
top_docs.score_docs.each do |score_doc|
|
87
|
+
expected << score_doc.doc
|
88
|
+
end
|
89
|
+
|
90
|
+
assert_raise(ArgumentError) { @is.search(tq, {:first_doc => -1}) }
|
91
|
+
assert_raise(ArgumentError) { @is.search(tq, {:num_docs => 0}) }
|
92
|
+
assert_raise(ArgumentError) { @is.search(tq, {:num_docs => -1}) }
|
93
|
+
|
94
|
+
check_docs(tq, {:num_docs => 8, :first_doc => 0}, expected[0,8])
|
95
|
+
check_docs(tq, {:num_docs => 3, :first_doc => 1}, expected[1,3])
|
96
|
+
check_docs(tq, {:num_docs => 6, :first_doc => 2}, expected[2,6])
|
97
|
+
check_docs(tq, {:num_docs => 2, :first_doc => expected.length}, [])
|
98
|
+
check_docs(tq, {:num_docs => 2, :first_doc => expected.length + 100}, [])
|
99
|
+
end
|
100
|
+
|
69
101
|
def test_boolean_query
|
70
102
|
bq = BooleanQuery.new()
|
71
103
|
tq1 = TermQuery.new(Term.new("field", "word1"))
|
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.8.11
|
|
3
3
|
specification_version: 1
|
4
4
|
name: ferret
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.2.
|
7
|
-
date: 2005-11-
|
6
|
+
version: 0.2.2
|
7
|
+
date: 2005-11-22 00:00:00 +09:00
|
8
8
|
summary: Ruby indexing library.
|
9
9
|
require_paths:
|
10
10
|
- lib
|