ferret 0.3.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/ferret.rb +1 -1
- data/lib/ferret/search/field_cache.rb +3 -3
- data/lib/ferret/search/field_sorted_hit_queue.rb +1 -1
- data/lib/ferret/search/fuzzy_term_enum.rb +1 -4
- data/lib/ferret/search/multi_phrase_query.rb +8 -4
- data/lib/ferret/search/score_doc_comparator.rb +3 -3
- data/lib/ferret/search/sort.rb +1 -0
- data/lib/ferret/search/sort_comparator.rb +2 -2
- data/lib/ferret/search/sort_field.rb +2 -2
- data/test/unit/index/tc_index.rb +38 -0
- data/test/unit/query_parser/tc_query_parser.rb +1 -0
- data/test/unit/search/tc_fuzzy_query.rb +3 -0
- data/test/unit/search/tc_index_searcher.rb +17 -3
- data/test/unit/search/tc_search_and_sort.rb +13 -12
- data/test/unit/search/tc_sort.rb +2 -2
- data/test/unit/search/tc_sort_field.rb +1 -1
- metadata +2 -2
data/lib/ferret.rb
CHANGED
@@ -85,7 +85,7 @@ module Ferret::Search
|
|
85
85
|
term = term_enum.term
|
86
86
|
break if (term.field != field)
|
87
87
|
termval = parser.call(term.text)
|
88
|
-
term_docs.seek(
|
88
|
+
term_docs.seek(term)
|
89
89
|
while term_docs.next?
|
90
90
|
index[term_docs.doc] = termval
|
91
91
|
end
|
@@ -141,7 +141,7 @@ module Ferret::Search
|
|
141
141
|
end
|
142
142
|
str_map[t] = term.text
|
143
143
|
|
144
|
-
term_docs.seek(
|
144
|
+
term_docs.seek(term)
|
145
145
|
while term_docs.next?
|
146
146
|
str_index[term_docs.doc] = t
|
147
147
|
end
|
@@ -192,7 +192,7 @@ module Ferret::Search
|
|
192
192
|
termtext = term.text.strip
|
193
193
|
|
194
194
|
if (termtext == termtext.to_i.to_s)
|
195
|
-
index = get_index(reader, field, SortField::SortType::
|
195
|
+
index = get_index(reader, field, SortField::SortType::INTEGER)
|
196
196
|
elsif (termtext == termtext.to_f.to_s or termtext == "%f"%termtext.to_f)
|
197
197
|
index = get_index(reader, field, SortField::SortType::FLOAT)
|
198
198
|
else
|
@@ -173,7 +173,7 @@ module Ferret::Search
|
|
173
173
|
if (index.is_a?(FieldCache::StringIndex))
|
174
174
|
return StringFieldComparator.new(index)
|
175
175
|
elsif (index[0].is_a?(Integer))
|
176
|
-
return SimpleFieldComparator.new(index, SortField::SortType::
|
176
|
+
return SimpleFieldComparator.new(index, SortField::SortType::INTEGER)
|
177
177
|
elsif (index[0].is_a?(Float))
|
178
178
|
return SimpleFieldComparator.new(index, SortField::SortType::FLOAT)
|
179
179
|
else
|
@@ -231,10 +231,7 @@ module Ferret::Search
|
|
231
231
|
# m:: the length of the "other value"
|
232
232
|
# returns:: the maximum levenshtein distance that we care about
|
233
233
|
def max_distance(m)
|
234
|
-
|
235
|
-
@max_distances[m] = calculate_max_distance(m)
|
236
|
-
end
|
237
|
-
return @max_distances[m]
|
234
|
+
return @max_distances[m] ||= calculate_max_distance(m)
|
238
235
|
end
|
239
236
|
|
240
237
|
def initialize_max_distances()
|
@@ -47,8 +47,12 @@ module Ferret::Search
|
|
47
47
|
end
|
48
48
|
end
|
49
49
|
|
50
|
-
|
51
|
-
|
50
|
+
if i = @positions.index(position)
|
51
|
+
term_arrays[i] += terms
|
52
|
+
else
|
53
|
+
@term_arrays << terms
|
54
|
+
@positions << position
|
55
|
+
end
|
52
56
|
end
|
53
57
|
alias :<< :add
|
54
58
|
|
@@ -167,10 +171,10 @@ module Ferret::Search
|
|
167
171
|
terms = @term_arrays[0]
|
168
172
|
bq = BooleanQuery.new(true)
|
169
173
|
terms.each do |term|
|
170
|
-
bq.
|
174
|
+
bq.add_query(TermQuery.new(term), BooleanClause::Occur::SHOULD)
|
171
175
|
end
|
172
176
|
bq.boost = boost()
|
173
|
-
return
|
177
|
+
return bq
|
174
178
|
else
|
175
179
|
return self
|
176
180
|
end
|
@@ -6,7 +6,7 @@ module Ferret::Search
|
|
6
6
|
RELEVANCE = ScoreDocComparator.new()
|
7
7
|
class <<RELEVANCE
|
8
8
|
def compare(i, j)
|
9
|
-
return
|
9
|
+
return j.score <=> i.score
|
10
10
|
end
|
11
11
|
def sort_value(i)
|
12
12
|
return i.score
|
@@ -74,7 +74,7 @@ module Ferret::Search
|
|
74
74
|
@sort_type = sort_type
|
75
75
|
end
|
76
76
|
|
77
|
-
def compare(
|
77
|
+
def compare(i, j)
|
78
78
|
return @index[i.doc] <=> @index[j.doc]
|
79
79
|
end
|
80
80
|
def sort_value(i)
|
@@ -90,7 +90,7 @@ module Ferret::Search
|
|
90
90
|
super(index, sort_type)
|
91
91
|
@comparator = comparator
|
92
92
|
end
|
93
|
-
def compare(
|
93
|
+
def compare(i, j)
|
94
94
|
return @comparator.call(@index[i.doc], @index[j.doc])
|
95
95
|
end
|
96
96
|
end
|
data/lib/ferret/search/sort.rb
CHANGED
@@ -81,6 +81,7 @@ module Ferret::Search
|
|
81
81
|
reverse = false)
|
82
82
|
fields = [fields] unless fields.is_a?(Array)
|
83
83
|
@fields = fields
|
84
|
+
fields = fields.map {|field| field.is_a?(Symbol) ? field.to_s : field}
|
84
85
|
if fields[0].is_a?(String)
|
85
86
|
@fields = fields.map do |field|
|
86
87
|
SortField.new(field, {:sort_type => SortField::SortType::AUTO,
|
@@ -2,8 +2,8 @@ module Ferret::Search
|
|
2
2
|
# Abstract base class for sorting hits returned by a Query.
|
3
3
|
#
|
4
4
|
# This class should only be used if the other SortField types (SCORE, DOC,
|
5
|
-
# STRING,
|
6
|
-
# internal cache of values which could be quite large. The cache is an
|
5
|
+
# STRING, INTEGER, FLOAT) do not provide an adequate sorting. It maintains
|
6
|
+
# an internal cache of values which could be quite large. The cache is an
|
7
7
|
# array of Comparable, one for each document in the index. There is a
|
8
8
|
# distinct Comparable for each unique term in the field - if some documents
|
9
9
|
# have the same term in the field, the cache array will have entries which
|
@@ -37,7 +37,7 @@ module Ferret::Search
|
|
37
37
|
|
38
38
|
# Sort using term values as encoded Integers. Sort values are Integer
|
39
39
|
# and lower values are at the front.
|
40
|
-
|
40
|
+
INTEGER = SortType.new("int", lambda{|str| str.to_i})
|
41
41
|
|
42
42
|
# Sort using term values as encoded Floats. Sort values are Float and
|
43
43
|
# lower values are at the front.
|
@@ -62,7 +62,7 @@ module Ferret::Search
|
|
62
62
|
# comparator:: a proc used to compare two values from the index. You can
|
63
63
|
# also give this value to the SortType object that you pass.
|
64
64
|
def initialize(name = nil, args= {})
|
65
|
-
@name = name
|
65
|
+
@name = name.to_s if name
|
66
66
|
@sort_type = args[:sort_type]||SortType::AUTO
|
67
67
|
@reverse = args[:reverse]||false
|
68
68
|
@comparator = args[:comparator]||@sort_type.comparator
|
data/test/unit/index/tc_index.rb
CHANGED
@@ -3,6 +3,7 @@ require File.dirname(__FILE__) + "/../../test_helper"
|
|
3
3
|
|
4
4
|
class IndexTest < Test::Unit::TestCase
|
5
5
|
include Ferret::Index
|
6
|
+
include Ferret::Search
|
6
7
|
include Ferret::Analysis
|
7
8
|
include Ferret::Store
|
8
9
|
include Ferret::Document
|
@@ -448,6 +449,43 @@ class IndexTest < Test::Unit::TestCase
|
|
448
449
|
index.close
|
449
450
|
end
|
450
451
|
|
452
|
+
def test_sortby_date
|
453
|
+
data = [
|
454
|
+
{:content => "one", :date => "20051023"},
|
455
|
+
{:content => "two", :date => "19530315"},
|
456
|
+
{:content => "three four", :date => "19390912"},
|
457
|
+
{:content => "one", :date => "19770905"},
|
458
|
+
{:content => "two", :date => "19810831"},
|
459
|
+
{:content => "three", :date => "19790531"},
|
460
|
+
{:content => "one", :date => "19770725"},
|
461
|
+
{:content => "two", :date => "19751226"},
|
462
|
+
{:content => "three", :date => "19390912"}
|
463
|
+
]
|
464
|
+
index = Index.new(:analyzer => WhiteSpaceAnalyzer.new)
|
465
|
+
data.each { |doc|
|
466
|
+
document = Document.new
|
467
|
+
doc.each_pair do |key, value|
|
468
|
+
document << Field.new(key.to_s, value, Field::Store::YES, Field::Index::TOKENIZED)
|
469
|
+
end
|
470
|
+
index << document
|
471
|
+
}
|
472
|
+
sf_date = SortField.new("date", {:sort_type => SortField::SortType::INTEGER})
|
473
|
+
#top_docs = index.search("one", :sort => [sf_date, SortField::FIELD_SCORE])
|
474
|
+
top_docs = index.search("one", :sort => Sort.new("date"))
|
475
|
+
assert_equal(3, top_docs.size)
|
476
|
+
assert_equal("19770725", index[top_docs.score_docs[0].doc][:date])
|
477
|
+
assert_equal("19770905", index[top_docs.score_docs[1].doc][:date])
|
478
|
+
assert_equal("20051023", index[top_docs.score_docs[2].doc][:date])
|
479
|
+
top_docs = index.search("one two three four",
|
480
|
+
:sort => [sf_date, SortField::FIELD_SCORE])
|
481
|
+
assert_equal("19390912", index[top_docs.score_docs[0].doc][:date])
|
482
|
+
assert_equal("three four", index[top_docs.score_docs[0].doc][:content])
|
483
|
+
assert_equal("19390912", index[top_docs.score_docs[1].doc][:date])
|
484
|
+
assert_equal("three", index[top_docs.score_docs[1].doc][:content])
|
485
|
+
assert_equal("19530315", index[top_docs.score_docs[2].doc][:date])
|
486
|
+
index.close
|
487
|
+
end
|
488
|
+
|
451
489
|
def test_auto_flush
|
452
490
|
fs_path = File.expand_path(File.join(File.dirname(__FILE__), '../../temp/fsdir'))
|
453
491
|
Dir[File.join(fs_path, "*")].each {|path| begin File.delete(path) rescue nil end}
|
@@ -17,6 +17,7 @@ class QueryParserTest < Test::Unit::TestCase
|
|
17
17
|
['field:"one <> <> <> three <>"', 'field:"one <> <> <> three"'],
|
18
18
|
['field:"one <> <> <> three|four|five <>"', 'field:"one <> <> <> three|four|five"'],
|
19
19
|
['field:"one|two three|four|five six|seven"', 'field:"one|two three|four|five six|seven"'],
|
20
|
+
['contents:"testing|trucks"', 'contents:"testing|trucks"'],
|
20
21
|
['[aaa bbb]', '[aaa bbb]'],
|
21
22
|
['{aaa bbb]', '{aaa bbb]'],
|
22
23
|
['field:[aaa bbb}', 'field:[aaa bbb}'],
|
@@ -47,6 +47,8 @@ class FuzzyQueryTest < Test::Unit::TestCase
|
|
47
47
|
add_doc("abbbb", iw)
|
48
48
|
add_doc("bbbbb", iw)
|
49
49
|
add_doc("ddddd", iw)
|
50
|
+
add_doc("ddddddddddddddddddddd", iw) # test max_distances problem
|
51
|
+
add_doc("aaaaaaaaaaaaaaaaaaaaaaa", iw) # test max_distances problem
|
50
52
|
#iw.optimize()
|
51
53
|
iw.close()
|
52
54
|
|
@@ -55,6 +57,7 @@ class FuzzyQueryTest < Test::Unit::TestCase
|
|
55
57
|
|
56
58
|
fq = FuzzyQuery.new(Term.new("field", "aaaaa"), FuzzyQuery.default_min_similarity, 5)
|
57
59
|
|
60
|
+
do_prefix_test(is, "aaaaaaaaaaaaaaaaaaaaaa", 1, [8])
|
58
61
|
do_prefix_test(is, "aaaaa", 0, [0,1,2])
|
59
62
|
do_prefix_test(is, "aaaaa", 1, [0,1,2])
|
60
63
|
do_prefix_test(is, "aaaaa", 2, [0,1,2])
|
@@ -139,9 +139,6 @@ class IndexSearcherTest < Test::Unit::TestCase
|
|
139
139
|
pq << t1 << t2 << t3
|
140
140
|
check_hits(pq, [1])
|
141
141
|
|
142
|
-
pq.slop = 4
|
143
|
-
check_hits(pq, [1,16,17])
|
144
|
-
|
145
142
|
pq = PhraseQuery.new()
|
146
143
|
pq << t1
|
147
144
|
pq.add(t3, 2)
|
@@ -154,6 +151,23 @@ class IndexSearcherTest < Test::Unit::TestCase
|
|
154
151
|
check_hits(pq, [1,11,14,16,17])
|
155
152
|
end
|
156
153
|
|
154
|
+
def test_multi_phrase_query()
|
155
|
+
pq = MultiPhraseQuery.new()
|
156
|
+
t1 = Term.new("field", "quick")
|
157
|
+
t2 = Term.new("field", "brown")
|
158
|
+
t3 = Term.new("field", "fox")
|
159
|
+
pq << t1
|
160
|
+
pq << t2
|
161
|
+
pq << t3
|
162
|
+
check_hits(pq, [1])
|
163
|
+
|
164
|
+
t1b = Term.new("field", "fast")
|
165
|
+
pq.add(t1b, 0)
|
166
|
+
check_hits(pq, [1, 8])
|
167
|
+
end
|
168
|
+
|
169
|
+
|
170
|
+
|
157
171
|
def test_range_query()
|
158
172
|
rq = RangeQuery.new("date", "20051006", "20051010", true, true)
|
159
173
|
check_hits(rq, [6,7,8,9,10])
|
@@ -18,7 +18,7 @@ class SearchAndSortTest < Test::Unit::TestCase
|
|
18
18
|
|
19
19
|
def setup()
|
20
20
|
@dir = RAMDirectory.new()
|
21
|
-
iw = IndexWriter.new(@dir, :analyzer => WhiteSpaceAnalyzer.new(), :create => true)
|
21
|
+
iw = IndexWriter.new(@dir, :analyzer => WhiteSpaceAnalyzer.new(), :create => true, :min_merge_docs => 3)
|
22
22
|
docs = [ # len mod
|
23
23
|
{"search"=>"findall","string"=>"a","int"=>"6","float"=>"0.01"}, # 4 0
|
24
24
|
{"search"=>"findall","string"=>"c","int"=>"5","float"=>"0.1"}, # 3 3
|
@@ -56,16 +56,16 @@ class SearchAndSortTest < Test::Unit::TestCase
|
|
56
56
|
do_test_top_docs(is, q, [0,1,2,3,4,5,6,7,8,9], [SortField::FIELD_DOC])
|
57
57
|
|
58
58
|
## int
|
59
|
-
sf_int = SortField.new("int", {:sort_type => SortField::SortType::
|
59
|
+
sf_int = SortField.new("int", {:sort_type => SortField::SortType::INTEGER, :reverse => true})
|
60
60
|
do_test_top_docs(is, q, [0,1,6,5,9,4,8,2,7,3], [sf_int])
|
61
61
|
do_test_top_docs(is, q, [0,1,6,5,9,8,4,7,2,3], [sf_int, SortField::FIELD_SCORE])
|
62
|
-
sf_int = SortField.new("int", {:sort_type => SortField::SortType::
|
62
|
+
sf_int = SortField.new("int", {:sort_type => SortField::SortType::INTEGER})
|
63
63
|
do_test_top_docs(is, q, [3,2,7,4,8,5,9,1,6,0], [sf_int])
|
64
64
|
|
65
65
|
## float
|
66
|
-
sf_float = SortField.new("float", {:sort_type => SortField::SortType::FLOAT})
|
67
|
-
do_test_top_docs(is, q, [8,7,5,3,1,0,2,4,6,9], Sort.new([sf_float, SortField::FIELD_SCORE]))
|
68
66
|
sf_float = SortField.new("float", {:sort_type => SortField::SortType::FLOAT, :reverse => true})
|
67
|
+
do_test_top_docs(is, q, [8,7,5,3,1,0,2,4,6,9], Sort.new([sf_float, SortField::FIELD_SCORE]))
|
68
|
+
sf_float = SortField.new("float", {:sort_type => SortField::SortType::FLOAT})
|
69
69
|
do_test_top_docs(is, q, [9,6,4,2,0,1,3,5,7,8], Sort.new([sf_float, SortField::FIELD_SCORE]))
|
70
70
|
|
71
71
|
## str
|
@@ -74,11 +74,11 @@ class SearchAndSortTest < Test::Unit::TestCase
|
|
74
74
|
|
75
75
|
## auto
|
76
76
|
do_test_top_docs(is, q, [0,9,1,8,2,7,3,6,4,5], Sort.new("string"))
|
77
|
-
do_test_top_docs(is, q, [
|
78
|
-
do_test_top_docs(is, q, [
|
79
|
-
do_test_top_docs(is, q, [
|
80
|
-
do_test_top_docs(is, q, [0,1,
|
81
|
-
do_test_top_docs(is, q, [3,7,
|
77
|
+
do_test_top_docs(is, q, [3,2,7,4,8,5,9,1,6,0], Sort.new(["int"]))
|
78
|
+
do_test_top_docs(is, q, [9,6,4,2,0,1,3,5,7,8], Sort.new("float"))
|
79
|
+
do_test_top_docs(is, q, [8,7,5,3,1,0,2,4,6,9], Sort.new("float", true))
|
80
|
+
do_test_top_docs(is, q, [0,6,1,5,9,4,8,7,2,3], Sort.new(["int", "string"], true))
|
81
|
+
do_test_top_docs(is, q, [3,2,7,8,4,9,5,1,6,0], Sort.new(["int", "string"]))
|
82
82
|
end
|
83
83
|
|
84
84
|
LENGTH = SortField::SortType.new("length", lambda{|str| str.length})
|
@@ -87,11 +87,12 @@ class SearchAndSortTest < Test::Unit::TestCase
|
|
87
87
|
def test_special_sorts
|
88
88
|
is = IndexSearcher.new(@dir)
|
89
89
|
q = TermQuery.new(Term.new("search", "findall"))
|
90
|
-
sf = SortField.new("float", {:sort_type => LENGTH})
|
90
|
+
sf = SortField.new("float", {:sort_type => LENGTH, :reverse => true})
|
91
91
|
do_test_top_docs(is, q, [9,6,4,8,2,7,0,5,1,3], [sf])
|
92
|
-
sf = SortField.new("float", {:sort_type => LENGTH_MODULO})
|
92
|
+
sf = SortField.new("float", {:sort_type => LENGTH_MODULO, :reverse => true})
|
93
93
|
do_test_top_docs(is, q, [1,3,6,4,8,2,7,0,5,9], [sf])
|
94
94
|
sf = SortField.new("float", {:sort_type => LENGTH,
|
95
|
+
:reverse => true,
|
95
96
|
:comparator => lambda{|i,j| (j%4) <=> (i%4)}})
|
96
97
|
do_test_top_docs(is, q, [0,5,9,2,7,4,8,1,3,6], [sf])
|
97
98
|
end
|
data/test/unit/search/tc_sort.rb
CHANGED
@@ -32,14 +32,14 @@ class SortTest < Test::Unit::TestCase
|
|
32
32
|
end
|
33
33
|
|
34
34
|
def test_multi_fields()
|
35
|
-
sf1 = SortField.new("field", {:sort_type => SortField::SortType::
|
35
|
+
sf1 = SortField.new("field", {:sort_type => SortField::SortType::INTEGER,
|
36
36
|
:reverse => true})
|
37
37
|
sf2 = SortField::FIELD_SCORE
|
38
38
|
sf3 = SortField::FIELD_DOC
|
39
39
|
s = Sort.new([sf1, sf2, sf3])
|
40
40
|
|
41
41
|
assert_equal(3, s.fields.size)
|
42
|
-
assert_equal(SortField::SortType::
|
42
|
+
assert_equal(SortField::SortType::INTEGER, s.fields[0].sort_type)
|
43
43
|
assert_equal("field", s.fields[0].name)
|
44
44
|
assert(s.fields[0].reverse?)
|
45
45
|
assert_equal(SortField::FIELD_SCORE, s.fields[1])
|
@@ -21,7 +21,7 @@ class SortFieldTest < Test::Unit::TestCase
|
|
21
21
|
|
22
22
|
def test_error_raised()
|
23
23
|
assert_raise(ArgumentError) {
|
24
|
-
fs = SortField.new(nil, {:sort_type => SortField::SortType::
|
24
|
+
fs = SortField.new(nil, {:sort_type => SortField::SortType::INTEGER})
|
25
25
|
}
|
26
26
|
end
|
27
27
|
end
|
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.8.11
|
|
3
3
|
specification_version: 1
|
4
4
|
name: ferret
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.3.1
|
7
|
-
date: 2005-12-
|
6
|
+
version: 0.3.2
|
7
|
+
date: 2005-12-16 00:00:00 +09:00
|
8
8
|
summary: Ruby indexing library.
|
9
9
|
require_paths:
|
10
10
|
- lib
|