ferret 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/ferret.rb +1 -1
- data/lib/ferret/analysis/analyzers.rb +5 -4
- data/lib/ferret/document/field.rb +8 -21
- data/lib/ferret/index/compound_file_io.rb +14 -22
- data/lib/ferret/index/index.rb +20 -1
- data/lib/ferret/index/index_reader.rb +1 -1
- data/lib/ferret/index/index_writer.rb +6 -6
- data/lib/ferret/index/segment_reader.rb +13 -12
- data/lib/ferret/query_parser.rb +43 -13
- data/lib/ferret/query_parser/query_parser.tab.rb +42 -13
- data/lib/ferret/search/index_searcher.rb +16 -12
- data/lib/ferret/search/term_scorer.rb +0 -2
- data/test/unit/document/tc_field.rb +6 -1
- data/test/unit/query_parser/tc_query_parser.rb +29 -4
- data/test/unit/search/tc_index_searcher.rb +33 -1
- metadata +2 -2
data/lib/ferret.rb
CHANGED
@@ -32,9 +32,8 @@ module Ferret::Analysis
|
|
32
32
|
# An array containing some common English words that are not usually useful
|
33
33
|
# for searching.
|
34
34
|
ENGLISH_STOP_WORDS = [
|
35
|
-
"a", "an", "and", "are", "as", "at", "be", "but", "by",
|
36
|
-
"
|
37
|
-
"no", "not", "of", "on", "or", "s", "such",
|
35
|
+
"a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if",
|
36
|
+
"in", "into", "is", "it", "no", "not", "of", "on", "or", "s", "such",
|
38
37
|
"t", "that", "the", "their", "then", "there", "these",
|
39
38
|
"they", "this", "to", "was", "will", "with"
|
40
39
|
]
|
@@ -51,6 +50,8 @@ module Ferret::Analysis
|
|
51
50
|
end
|
52
51
|
|
53
52
|
# An Analyzer that filters LetterTokenizer with LowerCaseFilter.
|
53
|
+
# This analyzer subclasses the StopAnalyzer so you can add your own
|
54
|
+
# stoplist the same way. See StopAnalyzer.
|
54
55
|
class StandardAnalyzer < StopAnalyzer
|
55
56
|
def token_stream(field, string)
|
56
57
|
return StopFilter.new(LowerCaseFilter.new(StandardTokenizer.new(string)), @stop_words)
|
@@ -84,7 +85,7 @@ module Ferret::Analysis
|
|
84
85
|
def token_stream(field, string)
|
85
86
|
analyzer = @analyzers[field]
|
86
87
|
if (analyzer == nil)
|
87
|
-
analyzer = @default_analyzer
|
88
|
+
analyzer = @default_analyzer
|
88
89
|
end
|
89
90
|
|
90
91
|
return analyzer.token_stream(field, string)
|
@@ -277,28 +277,15 @@ module Ferret::Document
|
|
277
277
|
str = ""
|
278
278
|
if (@stored)
|
279
279
|
str << "stored"
|
280
|
-
|
280
|
+
str << (@compressed ? "/compressed," : "/uncompressed,")
|
281
281
|
end
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
if (@store_offset)
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
str << "term_vector_position,"
|
290
|
-
end
|
291
|
-
if (@binary) then str << "binary," end
|
292
|
-
|
293
|
-
str << '<'
|
294
|
-
str << @name
|
295
|
-
str << ':'
|
296
|
-
|
297
|
-
if (@data != null)
|
298
|
-
str << @data.to_s
|
299
|
-
end
|
300
|
-
|
301
|
-
str << '>'
|
282
|
+
str << "indexed," if (@indexed)
|
283
|
+
str << "tokenized," if (@tokenized)
|
284
|
+
str << "store_term_vector," if (@store_term_vector)
|
285
|
+
str << "tv_offset," if (@store_offset)
|
286
|
+
str << "tv_position," if (@store_position)
|
287
|
+
str << "binary," if (@binary)
|
288
|
+
str << "<#{@name}:#{data}>"
|
302
289
|
end
|
303
290
|
end
|
304
291
|
end
|
@@ -107,10 +107,10 @@ module Ferret::Index
|
|
107
107
|
end
|
108
108
|
|
109
109
|
# Not implemented
|
110
|
-
def
|
110
|
+
def remove(name) raise(NotImplementedError) end
|
111
111
|
|
112
112
|
# Not implemented
|
113
|
-
def rename(from, to) raise(
|
113
|
+
def rename(from, to) raise(NotImplementedError) end
|
114
114
|
|
115
115
|
# Returns the length of a file in the directory.
|
116
116
|
def length(name)
|
@@ -120,10 +120,10 @@ module Ferret::Index
|
|
120
120
|
end
|
121
121
|
|
122
122
|
# Not implemented
|
123
|
-
def create_output(name) raise(
|
123
|
+
def create_output(name) raise(NotImplementedError) end
|
124
124
|
|
125
125
|
# Not implemented
|
126
|
-
def make_lock(name) raise(
|
126
|
+
def make_lock(name) raise(NotImplementedError) end
|
127
127
|
|
128
128
|
# Implementation of an IndexInput that reads from a portion of the
|
129
129
|
# compound file.
|
@@ -206,8 +206,8 @@ module Ferret::Index
|
|
206
206
|
# Add a source stream. _file_name_ is the string by which the
|
207
207
|
# sub-stream will be known in the compound stream.
|
208
208
|
#
|
209
|
-
#
|
210
|
-
#
|
209
|
+
# Raises:: StateError if this writer is closed
|
210
|
+
# Raises:: ArgumentError if a file with the same name
|
211
211
|
# has been added already
|
212
212
|
def add_file(file_name)
|
213
213
|
if @merged
|
@@ -253,7 +253,7 @@ module Ferret::Index
|
|
253
253
|
# Remember the positions of directory entries so that we can
|
254
254
|
# adjust the offsets later
|
255
255
|
@file_entries.each do |fe|
|
256
|
-
fe.
|
256
|
+
fe.dir_offset = os.pos()
|
257
257
|
os.write_long(0) # for now
|
258
258
|
os.write_string(fe.file_name)
|
259
259
|
end
|
@@ -267,7 +267,7 @@ module Ferret::Index
|
|
267
267
|
|
268
268
|
# Write the data offsets into the directory of the compound stream
|
269
269
|
@file_entries.each do |fe|
|
270
|
-
os.seek(fe.
|
270
|
+
os.seek(fe.dir_offset)
|
271
271
|
os.write_long(fe.data_offset)
|
272
272
|
end
|
273
273
|
|
@@ -292,15 +292,7 @@ module Ferret::Index
|
|
292
292
|
private
|
293
293
|
|
294
294
|
# Internal class for holding a file
|
295
|
-
|
296
|
-
|
297
|
-
attr_accessor :file_name, :directory_offset, :data_offset
|
298
|
-
|
299
|
-
def initialize(file_name)
|
300
|
-
@file_name = file_name
|
301
|
-
end
|
302
|
-
|
303
|
-
end
|
295
|
+
FileEntry = Struct.new(:file_name, :dir_offset, :data_offset)
|
304
296
|
|
305
297
|
# Copy the contents of the file with specified extension into the
|
306
298
|
# provided output stream. Use a buffer for moving data
|
@@ -324,9 +316,9 @@ module Ferret::Index
|
|
324
316
|
# Verify that remainder is 0
|
325
317
|
if (remainder != 0)
|
326
318
|
raise(IOError,
|
327
|
-
"Non-zero remainder length after copying: " +
|
328
|
-
|
329
|
-
|
319
|
+
"Non-zero remainder length after copying: #{remainder} " +
|
320
|
+
"(id: #{source.file_name}, length: #{length}, buffer size: " +
|
321
|
+
" #{Ferret::Store::BUFFER_SIZE})")
|
330
322
|
end
|
331
323
|
|
332
324
|
# Verify that the output length diff is equal to original file
|
@@ -334,8 +326,8 @@ module Ferret::Index
|
|
334
326
|
diff = end_ptr - start_ptr
|
335
327
|
if (diff != length)
|
336
328
|
raise(IOError,
|
337
|
-
"Difference in the output file offsets " +
|
338
|
-
|
329
|
+
"Difference in the output file offsets #{diff}" +
|
330
|
+
" does not match the original file length #{length}")
|
339
331
|
end
|
340
332
|
|
341
333
|
ensure
|
data/lib/ferret/index/index.rb
CHANGED
@@ -76,6 +76,23 @@ module Ferret::Index
|
|
76
76
|
# be replaced by the new object. This will slow
|
77
77
|
# down indexing so it should not be used if
|
78
78
|
# performance is a concern.
|
79
|
+
# use_compound_file:: Uses a compound file to store the index. This
|
80
|
+
# prevents an error being raised for having too
|
81
|
+
# many files open at the same time. The default is
|
82
|
+
# true but performance is better if this is set to
|
83
|
+
# false.
|
84
|
+
# handle_parse_errors:: Set this to true if you want the QueryParser to
|
85
|
+
# degrade gracefully on errors. If the query parser
|
86
|
+
# fails to parse this query, it will try to parse
|
87
|
+
# it as a straight boolean query on the default
|
88
|
+
# field ignoring all query punctuation. If this
|
89
|
+
# fails, it will return an empty TermQuery. If you
|
90
|
+
# use this and you need to know why your query
|
91
|
+
# isn't working you can use the Query#to_s method
|
92
|
+
# on the query returned to see what is happening to
|
93
|
+
# your query. This defualts to true. If you set it
|
94
|
+
# to false a QueryParseException is raised on a
|
95
|
+
# query parse error.
|
79
96
|
#
|
80
97
|
# Some examples;
|
81
98
|
#
|
@@ -86,7 +103,8 @@ module Ferret::Index
|
|
86
103
|
#
|
87
104
|
# index = Index::Index.new(:dir => directory,
|
88
105
|
# :close_dir => false
|
89
|
-
# :default_slop => 2
|
106
|
+
# :default_slop => 2,
|
107
|
+
# :handle_parse_errors => false)
|
90
108
|
#
|
91
109
|
def initialize(options = {})
|
92
110
|
super()
|
@@ -117,6 +135,7 @@ module Ferret::Index
|
|
117
135
|
@default_search_field = (@options[:default_search_field] || \
|
118
136
|
@options[:default_field] || "*")
|
119
137
|
@default_field = @options[:default_field] || ""
|
138
|
+
@options[:handle_parse_errors] = true if @options[:handle_parse_errors].nil?
|
120
139
|
@open = true
|
121
140
|
@qp = nil
|
122
141
|
end
|
@@ -100,7 +100,7 @@ module Ferret::Index
|
|
100
100
|
if directory.nil?
|
101
101
|
directory = Ferret::Store::RAMDirectory.new
|
102
102
|
elsif directory.is_a?(String)
|
103
|
-
directory = Ferret::Store::FSDirectory.new(directory,
|
103
|
+
directory = Ferret::Store::FSDirectory.new(directory, false)
|
104
104
|
end
|
105
105
|
directory.synchronize do # in- & inter-process sync
|
106
106
|
commit_lock = directory.make_lock(IndexWriter::COMMIT_LOCK_NAME)
|
@@ -83,21 +83,21 @@ module Index
|
|
83
83
|
@close_dir = options[:close_dir] || false
|
84
84
|
@use_compound_file = (options[:use_compound_file] != false) # ie default true
|
85
85
|
@analyzer = options[:analyzer] || Ferret::Analysis::StandardAnalyzer.new
|
86
|
-
@merge_factor = DEFAULT_MERGE_FACTOR
|
87
|
-
@min_merge_docs = DEFAULT_MIN_MERGE_DOCS
|
88
|
-
@max_merge_docs = DEFAULT_MAX_MERGE_DOCS
|
89
|
-
@max_field_length = DEFAULT_MAX_FIELD_LENGTH
|
90
|
-
@term_index_interval = DEFAULT_TERM_INDEX_INTERVAL
|
86
|
+
@merge_factor = options[:merge_factor] || DEFAULT_MERGE_FACTOR
|
87
|
+
@min_merge_docs = options[:min_merge_docs] || DEFAULT_MIN_MERGE_DOCS
|
88
|
+
@max_merge_docs = options[:max_merge_docs] || DEFAULT_MAX_MERGE_DOCS
|
89
|
+
@max_field_length = options[:max_field_length] || DEFAULT_MAX_FIELD_LENGTH
|
90
|
+
@term_index_interval = options[:term_index_interval] || DEFAULT_TERM_INDEX_INTERVAL
|
91
91
|
|
92
92
|
@similarity = Search::Similarity.default
|
93
93
|
@segment_infos = SegmentInfos.new()
|
94
94
|
@ram_directory = Ferret::Store::RAMDirectory.new()
|
95
95
|
|
96
96
|
# Make sure that the lock is released when this object is destroyed
|
97
|
-
define_finalizer(self, proc { |id| @write_lock.release() if @write_lock})
|
98
97
|
|
99
98
|
@write_lock = @directory.make_lock(WRITE_LOCK_NAME)
|
100
99
|
@write_lock.obtain(WRITE_LOCK_TIMEOUT) # obtain write lock
|
100
|
+
define_finalizer(@write_lock, proc { |id| @write_lock.release() if @write_lock})
|
101
101
|
|
102
102
|
@directory.synchronize() do # in- & inter-process sync
|
103
103
|
@directory.make_lock(COMMIT_LOCK_NAME).while_locked(COMMIT_LOCK_TIMEOUT) do
|
@@ -16,16 +16,17 @@ module Ferret::Index
|
|
16
16
|
@segment = info.name
|
17
17
|
|
18
18
|
@cfs_reader = nil
|
19
|
-
|
20
|
-
if directory.exists?(@segment + '.cfs') then
|
19
|
+
dir = directory
|
20
|
+
#if directory.exists?(@segment + '.cfs') then
|
21
|
+
if SegmentReader.uses_compound_file?(info)
|
21
22
|
@cfs_reader = CompoundFileReader.new(directory, @segment + '.cfs')
|
22
|
-
|
23
|
+
dir = @cfs_reader
|
23
24
|
end
|
24
25
|
|
25
|
-
@field_infos = FieldInfos.new(
|
26
|
-
@fields_reader = FieldsReader.new(
|
26
|
+
@field_infos = FieldInfos.new(dir, @segment + '.fnm')
|
27
|
+
@fields_reader = FieldsReader.new(dir, @segment, @field_infos)
|
27
28
|
|
28
|
-
@term_infos = TermInfosReader.new(
|
29
|
+
@term_infos = TermInfosReader.new(dir, @segment, @field_infos)
|
29
30
|
@deleted_docs = nil
|
30
31
|
@deleted_docs_dirty = false
|
31
32
|
if SegmentReader.has_deletions?(info) then
|
@@ -33,16 +34,16 @@ module Ferret::Index
|
|
33
34
|
Ferret::Utils::BitVector.read(directory, @segment + '.del')
|
34
35
|
end
|
35
36
|
|
36
|
-
@freq_stream =
|
37
|
-
@prox_stream =
|
37
|
+
@freq_stream = dir.open_input(@segment + '.frq')
|
38
|
+
@prox_stream = dir.open_input(@segment + '.prx')
|
38
39
|
@norms = {}
|
39
40
|
@norms.extend(MonitorMixin)
|
40
41
|
@norms_dirty = false
|
41
|
-
open_norms(
|
42
|
+
open_norms(dir)
|
42
43
|
|
43
44
|
@tv_reader_orig = nil
|
44
45
|
if @field_infos.has_vectors? then
|
45
|
-
@tv_reader_orig = TermVectorsReader.new(
|
46
|
+
@tv_reader_orig = TermVectorsReader.new(dir, @segment, @field_infos)
|
46
47
|
end
|
47
48
|
end
|
48
49
|
|
@@ -128,9 +129,9 @@ module Ferret::Index
|
|
128
129
|
@field_infos.each_with_index do |fi, i|
|
129
130
|
if (fi.indexed?)
|
130
131
|
if @cfs_reader.nil?
|
131
|
-
name = @segment
|
132
|
+
name = "#{@segment}.f#{i}"
|
132
133
|
else
|
133
|
-
name = @segment
|
134
|
+
name = "#{@segment}.s#{i}"
|
134
135
|
end
|
135
136
|
if (@directory.exists?(name))
|
136
137
|
file_names << name
|
data/lib/ferret/query_parser.rb
CHANGED
@@ -242,17 +242,29 @@ module Ferret
|
|
242
242
|
#
|
243
243
|
# === Options
|
244
244
|
#
|
245
|
-
# analyzer::
|
246
|
-
#
|
247
|
-
#
|
248
|
-
# occur_default::
|
249
|
-
#
|
250
|
-
#
|
251
|
-
# wild_lower::
|
252
|
-
#
|
253
|
-
#
|
254
|
-
#
|
255
|
-
#
|
245
|
+
# analyzer:: The analyzer is used to break phrases up into
|
246
|
+
# terms and to turn terms in tokens recognized in
|
247
|
+
# the index. Analysis::Analyzer is the default
|
248
|
+
# occur_default:: Set to either BooleanClause::Occur::SHOULD
|
249
|
+
# (default) or BooleanClause::Occur::MUST to specify
|
250
|
+
# the default Occur operator.
|
251
|
+
# wild_lower:: Set to false if you don't want the terms in fuzzy
|
252
|
+
# and wild queries to be set to lower case. You
|
253
|
+
# should do this if your analyzer doesn't downcase.
|
254
|
+
# The default is true.
|
255
|
+
# default_slop:: Set the default slop for phrase queries. This
|
256
|
+
# defaults to 0.
|
257
|
+
# handle_parse_errors:: Set this to true if you want the QueryParser to
|
258
|
+
# degrade gracefully on errors. If the query parser
|
259
|
+
# fails to parse this query, it will try to parse it
|
260
|
+
# as a straight boolean query on the default field
|
261
|
+
# ignoring all query punctuation. If this fails, it
|
262
|
+
# will return an empty TermQuery. If you use this
|
263
|
+
# and you need to know why your query isn't working
|
264
|
+
# you can use the Query#to_s method on the query
|
265
|
+
# returned to see what is happening to your query.
|
266
|
+
# This defaults to false, in which case a
|
267
|
+
# QueryParseException is thrown.
|
256
268
|
def initialize(default_field = "", options = {})
|
257
269
|
end
|
258
270
|
|
@@ -263,10 +275,10 @@ module Ferret
|
|
263
275
|
|
264
276
|
# Set to false if you don't want the terms in fuzzy and wild queries to be
|
265
277
|
# set to lower case. You should do this if your analyzer doesn't downcase.
|
266
|
-
def wild_lower()
|
278
|
+
def wild_lower=()
|
267
279
|
end
|
268
280
|
|
269
|
-
# Returns the value of wild_lower. See #wild_lower
|
281
|
+
# Returns the value of wild_lower. See #wild_lower=.
|
270
282
|
def wild_lower?()
|
271
283
|
end
|
272
284
|
|
@@ -276,7 +288,25 @@ module Ferret
|
|
276
288
|
# if you'd like to do your own query string cleaning.
|
277
289
|
def clean_string(str)
|
278
290
|
end
|
291
|
+
|
292
|
+
# The exception thrown when there is an error parsing the query string.
|
293
|
+
# This also holds the Racc::ParseError that was thrown in case you want to
|
294
|
+
# investigate why a query won't parse.
|
295
|
+
class QueryParseException < Exception
|
296
|
+
attr_reader :parse_error
|
297
|
+
|
298
|
+
# Create a new QueryParseException
|
299
|
+
#
|
300
|
+
# error:: An error string describing the query that failed
|
301
|
+
# parse_error:: The actual parse error that was thrown by Racc. It is a
|
302
|
+
# Racc::ParseError object.
|
303
|
+
def initialize(error, parse_error)
|
304
|
+
super(error)
|
305
|
+
@parse_error = parse_error
|
306
|
+
end
|
307
|
+
end
|
279
308
|
end
|
309
|
+
|
280
310
|
end
|
281
311
|
|
282
312
|
require 'ferret/query_parser/query_parser.tab.rb'
|
@@ -11,15 +11,8 @@ module Ferret
|
|
11
11
|
|
12
12
|
class QueryParser < Racc::Parser
|
13
13
|
|
14
|
-
module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..
|
15
|
-
attr_accessor :default_field, :fields
|
16
|
-
|
17
|
-
# true if you want to downcase wild card queries. This is set to try by
|
18
|
-
# default.
|
19
|
-
attr_writer :wild_lower
|
20
|
-
|
21
|
-
def wild_lower?() @wild_lower end
|
22
|
-
|
14
|
+
module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id81dbd43492', 'lib/ferret/query_parser/query_parser.y', 126
|
15
|
+
attr_accessor :default_field, :fields, :handle_parse_errors
|
23
16
|
|
24
17
|
def initialize(default_field = "*", options = {})
|
25
18
|
@yydebug = true
|
@@ -32,6 +25,7 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id9e08d4407
|
|
32
25
|
@occur_default = options[:occur_default] || BooleanClause::Occur::SHOULD
|
33
26
|
@default_slop = options[:default_slop] || 0
|
34
27
|
@fields = options[:fields]||[]
|
28
|
+
@handle_parse_errors = options[:handle_parse_errors] || false
|
35
29
|
end
|
36
30
|
|
37
31
|
RESERVED = {
|
@@ -50,6 +44,7 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id9e08d4407
|
|
50
44
|
EWCHR = %q,:()\[\]{}!+"~^\-\|<>\=,
|
51
45
|
|
52
46
|
def parse(str)
|
47
|
+
orig_str = str
|
53
48
|
str = clean_string(str)
|
54
49
|
str.strip!
|
55
50
|
@q = []
|
@@ -82,10 +77,24 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id9e08d4407
|
|
82
77
|
end
|
83
78
|
str = $'
|
84
79
|
end
|
85
|
-
@q.
|
80
|
+
if @q.empty?
|
81
|
+
return TermQuery.new(Term.new(@default_field, ""))
|
82
|
+
end
|
83
|
+
|
84
|
+
@q.push([ false, '$' ])
|
86
85
|
#p @q
|
87
86
|
|
88
|
-
|
87
|
+
begin
|
88
|
+
query = do_parse
|
89
|
+
rescue Racc::ParseError => e
|
90
|
+
if @handle_parse_errors
|
91
|
+
@field = @default_field
|
92
|
+
query = _get_bad_query(orig_str)
|
93
|
+
else
|
94
|
+
raise QueryParseException.new("Could not parse #{str}", e)
|
95
|
+
end
|
96
|
+
end
|
97
|
+
return query
|
89
98
|
end
|
90
99
|
|
91
100
|
def next_token
|
@@ -160,6 +169,25 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id9e08d4407
|
|
160
169
|
return new_str.pack("c*")
|
161
170
|
end
|
162
171
|
|
172
|
+
def get_bad_query(field, str)
|
173
|
+
tokens = []
|
174
|
+
stream = @analyzer.token_stream(field, str)
|
175
|
+
while token = stream.next
|
176
|
+
tokens << token
|
177
|
+
end
|
178
|
+
if tokens.length == 0
|
179
|
+
return TermQuery.new(Term.new(field, ""))
|
180
|
+
elsif tokens.length == 1
|
181
|
+
return TermQuery.new(Term.new(field, tokens[0].term_text))
|
182
|
+
else
|
183
|
+
bq = BooleanQuery.new()
|
184
|
+
tokens.each do |token|
|
185
|
+
bq << BooleanClause.new(TermQuery.new(Term.new(field, token.term_text)))
|
186
|
+
end
|
187
|
+
return bq
|
188
|
+
end
|
189
|
+
end
|
190
|
+
|
163
191
|
def get_range_query(field, start_word, end_word, inc_upper, inc_lower)
|
164
192
|
RangeQuery.new(field, start_word, end_word, inc_upper, inc_lower)
|
165
193
|
end
|
@@ -374,7 +402,7 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id9e08d4407
|
|
374
402
|
return qp.parse(query)
|
375
403
|
end
|
376
404
|
|
377
|
-
..end lib/ferret/query_parser/query_parser.y modeval..
|
405
|
+
..end lib/ferret/query_parser/query_parser.y modeval..id81dbd43492
|
378
406
|
|
379
407
|
##### racc 1.4.4 generates ###
|
380
408
|
|
@@ -893,7 +921,8 @@ if __FILE__ == $0
|
|
893
921
|
|
894
922
|
parser = Ferret::QueryParser.new("default",
|
895
923
|
:fields => ["f1", "f2", "f3"],
|
896
|
-
:analyzer => Ferret::Analysis::StandardAnalyzer.new
|
924
|
+
:analyzer => Ferret::Analysis::StandardAnalyzer.new,
|
925
|
+
:handle_parse_errors => true)
|
897
926
|
|
898
927
|
$stdin.each do |line|
|
899
928
|
query = parser.parse(line)
|
@@ -90,12 +90,17 @@ module Ferret::Search
|
|
90
90
|
filter = options[:filter]
|
91
91
|
first_doc = options[:first_doc]||0
|
92
92
|
num_docs = options[:num_docs]||10
|
93
|
+
max_size = first_doc + num_docs
|
93
94
|
sort = options[:sort]
|
94
95
|
|
95
|
-
if (num_docs <= 0)
|
96
|
+
if (num_docs <= 0)
|
96
97
|
raise ArgumentError, "num_docs must be > 0 to run a search"
|
97
98
|
end
|
98
99
|
|
100
|
+
if (first_doc < 0)
|
101
|
+
raise ArgumentError, "first_doc must be >= 0 to run a search"
|
102
|
+
end
|
103
|
+
|
99
104
|
scorer = query.weight(self).scorer(@reader)
|
100
105
|
if (scorer == nil)
|
101
106
|
return TopDocs.new(0, [])
|
@@ -104,33 +109,32 @@ module Ferret::Search
|
|
104
109
|
bits = (filter.nil? ? nil : filter.bits(@reader))
|
105
110
|
if (sort)
|
106
111
|
fields = sort.is_a?(Array) ? sort : sort.fields
|
107
|
-
hq = FieldSortedHitQueue.new(@reader, fields,
|
112
|
+
hq = FieldSortedHitQueue.new(@reader, fields, max_size)
|
108
113
|
else
|
109
|
-
hq = HitQueue.new(
|
114
|
+
hq = HitQueue.new(max_size)
|
110
115
|
end
|
111
116
|
total_hits = 0
|
112
117
|
min_score = 0.0
|
113
118
|
scorer.each_hit() do |doc, score|
|
114
119
|
if score > 0.0 and (bits.nil? or bits.get(doc)) # skip docs not in bits
|
115
120
|
total_hits += 1
|
116
|
-
if hq.size <
|
121
|
+
if hq.size < max_size or score >= min_score
|
117
122
|
hq.insert(ScoreDoc.new(doc, score))
|
118
123
|
min_score = hq.top.score # maintain min_score
|
119
124
|
end
|
120
125
|
end
|
121
126
|
end
|
122
127
|
|
123
|
-
score_docs =
|
128
|
+
score_docs = []
|
124
129
|
if (hq.size > first_doc)
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
130
|
+
if (hq.size - first_doc) < num_docs
|
131
|
+
num_docs = hq.size - first_doc
|
132
|
+
end
|
133
|
+
num_docs.times do
|
134
|
+
score_docs.unshift(hq.pop)
|
129
135
|
end
|
130
|
-
else
|
131
|
-
score_docs = []
|
132
|
-
hq.clear
|
133
136
|
end
|
137
|
+
hq.clear
|
134
138
|
|
135
139
|
return TopDocs.new(total_hits, score_docs)
|
136
140
|
end
|
@@ -37,13 +37,15 @@ class FieldTest < Test::Unit::TestCase
|
|
37
37
|
assert_equal(false, f.store_offsets?)
|
38
38
|
assert_equal(false, f.store_positions?)
|
39
39
|
assert_equal(false, f.binary?)
|
40
|
+
assert_equal("stored/compressed,indexed,tokenized,<name:value>", f.to_s)
|
40
41
|
end
|
41
42
|
|
42
43
|
def test_set_store()
|
43
|
-
f = Field.new("name",
|
44
|
+
f = Field.new("name", nil, Field::Store::COMPRESS, Field::Index::TOKENIZED)
|
44
45
|
f.stored = Field::Store::NO
|
45
46
|
assert_equal(false, f.stored?)
|
46
47
|
assert_equal(false, f.compressed?)
|
48
|
+
assert_equal("indexed,tokenized,<name:>", f.to_s)
|
47
49
|
end
|
48
50
|
|
49
51
|
def test_set_index()
|
@@ -51,6 +53,7 @@ class FieldTest < Test::Unit::TestCase
|
|
51
53
|
f.index = Field::Index::NO
|
52
54
|
assert_equal(false, f.indexed?)
|
53
55
|
assert_equal(false, f.tokenized?)
|
56
|
+
assert_equal("stored/compressed,<name:value>", f.to_s)
|
54
57
|
end
|
55
58
|
|
56
59
|
def test_set_term_vector()
|
@@ -59,6 +62,7 @@ class FieldTest < Test::Unit::TestCase
|
|
59
62
|
assert_equal(true, f.store_term_vector?)
|
60
63
|
assert_equal(true, f.store_offsets?)
|
61
64
|
assert_equal(true, f.store_positions?)
|
65
|
+
assert_equal("stored/compressed,indexed,tokenized,store_term_vector,tv_offset,tv_position,<name:value>", f.to_s)
|
62
66
|
end
|
63
67
|
|
64
68
|
def test_new_binary_field()
|
@@ -76,5 +80,6 @@ class FieldTest < Test::Unit::TestCase
|
|
76
80
|
assert_equal(false, f.store_offsets?)
|
77
81
|
assert_equal(false, f.store_positions?)
|
78
82
|
assert_equal(true, f.binary?)
|
83
|
+
assert_equal("stored/uncompressed,binary,<name:#{bin}>", f.to_s)
|
79
84
|
end
|
80
85
|
end
|
@@ -5,6 +5,7 @@ class QueryParserTest < Test::Unit::TestCase
|
|
5
5
|
def test_strings()
|
6
6
|
parser = Ferret::QueryParser.new("xxx", :fields => ["f1", "f2", "f3"])
|
7
7
|
pairs = [
|
8
|
+
['', ''],
|
8
9
|
['word', 'word'],
|
9
10
|
['field:word', 'field:word'],
|
10
11
|
['"word1 word2 word3"', '"word word word"'],
|
@@ -92,8 +93,8 @@ class QueryParserTest < Test::Unit::TestCase
|
|
92
93
|
['"onewordphrase"', 'onewordphrase']
|
93
94
|
]
|
94
95
|
|
95
|
-
pairs.each do |
|
96
|
-
assert_equal(
|
96
|
+
pairs.each do |query_str, expected|
|
97
|
+
assert_equal(expected, parser.parse(query_str).to_s(parser.default_field))
|
97
98
|
end
|
98
99
|
end
|
99
100
|
|
@@ -105,8 +106,32 @@ class QueryParserTest < Test::Unit::TestCase
|
|
105
106
|
['key:(1234)', 'key:1234']
|
106
107
|
]
|
107
108
|
|
108
|
-
pairs.each do |
|
109
|
-
assert_equal(
|
109
|
+
pairs.each do |query_str, expected|
|
110
|
+
assert_equal(expected, parser.parse(query_str).to_s(parser.default_field))
|
111
|
+
end
|
112
|
+
end
|
113
|
+
|
114
|
+
def do_test_query_parse_exception_raised(str)
|
115
|
+
parser = Ferret::QueryParser.new("xxx", :fields => ["f1", "f2", "f3"])
|
116
|
+
assert_raise(Ferret::QueryParser::QueryParseException) do
|
117
|
+
parser.parse(str)
|
118
|
+
end
|
119
|
+
end
|
120
|
+
|
121
|
+
|
122
|
+
def test_bad_queries
|
123
|
+
parser = Ferret::QueryParser.new("xxx", :fields => ["f1", "f2"],
|
124
|
+
:handle_parse_errors => true)
|
125
|
+
|
126
|
+
pairs = [
|
127
|
+
['(*word', 'word'],
|
128
|
+
['()*&)(*^&*(', ''],
|
129
|
+
['()*&one)(*two(*&"', 'one two']
|
130
|
+
]
|
131
|
+
|
132
|
+
pairs.each do |query_str, expected|
|
133
|
+
do_test_query_parse_exception_raised(query_str)
|
134
|
+
assert_equal(expected, parser.parse(query_str).to_s(parser.default_field))
|
110
135
|
end
|
111
136
|
end
|
112
137
|
end
|
@@ -46,6 +46,15 @@ class IndexSearcherTest < Test::Unit::TestCase
|
|
46
46
|
end
|
47
47
|
end
|
48
48
|
|
49
|
+
def check_docs(query, options, expected=[])
|
50
|
+
top_docs = @is.search(query, options)
|
51
|
+
docs = top_docs.score_docs
|
52
|
+
assert_equal(expected.length, docs.length)
|
53
|
+
docs.length.times do |i|
|
54
|
+
assert_equal(expected[i], docs[i].doc)
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
49
58
|
def test_get_doc()
|
50
59
|
assert_equal(18, @is.max_doc)
|
51
60
|
assert_equal("20050930", @is.doc(0).values(:date))
|
@@ -57,15 +66,38 @@ class IndexSearcherTest < Test::Unit::TestCase
|
|
57
66
|
tq.boost = 100
|
58
67
|
check_hits(tq, [1,4,8])
|
59
68
|
|
69
|
+
tq = TermQuery.new(Term.new("field", ""));
|
70
|
+
check_hits(tq, [])
|
71
|
+
|
60
72
|
tq = TermQuery.new(Term.new("field", "word1"));
|
61
73
|
top_docs = @is.search(tq)
|
62
|
-
#puts top_docs.score_docs
|
63
74
|
assert_equal(@documents.size, top_docs.total_hits)
|
64
75
|
assert_equal(10, top_docs.score_docs.size)
|
65
76
|
top_docs = @is.search(tq, {:num_docs => 20})
|
66
77
|
assert_equal(@documents.size, top_docs.score_docs.size)
|
67
78
|
end
|
68
79
|
|
80
|
+
|
81
|
+
def test_first_doc
|
82
|
+
tq = TermQuery.new(Term.new("field", "word1"));
|
83
|
+
tq.boost = 100
|
84
|
+
top_docs = @is.search(tq, {:num_docs => 100})
|
85
|
+
expected = []
|
86
|
+
top_docs.score_docs.each do |score_doc|
|
87
|
+
expected << score_doc.doc
|
88
|
+
end
|
89
|
+
|
90
|
+
assert_raise(ArgumentError) { @is.search(tq, {:first_doc => -1}) }
|
91
|
+
assert_raise(ArgumentError) { @is.search(tq, {:num_docs => 0}) }
|
92
|
+
assert_raise(ArgumentError) { @is.search(tq, {:num_docs => -1}) }
|
93
|
+
|
94
|
+
check_docs(tq, {:num_docs => 8, :first_doc => 0}, expected[0,8])
|
95
|
+
check_docs(tq, {:num_docs => 3, :first_doc => 1}, expected[1,3])
|
96
|
+
check_docs(tq, {:num_docs => 6, :first_doc => 2}, expected[2,6])
|
97
|
+
check_docs(tq, {:num_docs => 2, :first_doc => expected.length}, [])
|
98
|
+
check_docs(tq, {:num_docs => 2, :first_doc => expected.length + 100}, [])
|
99
|
+
end
|
100
|
+
|
69
101
|
def test_boolean_query
|
70
102
|
bq = BooleanQuery.new()
|
71
103
|
tq1 = TermQuery.new(Term.new("field", "word1"))
|
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.8.11
|
|
3
3
|
specification_version: 1
|
4
4
|
name: ferret
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.2.
|
7
|
-
date: 2005-11-
|
6
|
+
version: 0.2.2
|
7
|
+
date: 2005-11-22 00:00:00 +09:00
|
8
8
|
summary: Ruby indexing library.
|
9
9
|
require_paths:
|
10
10
|
- lib
|