ferret 0.10.11 → 0.10.12
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +13 -0
- data/Rakefile +1 -1
- data/ext/analysis.c +62 -11
- data/ext/analysis.h +11 -0
- data/ext/bitvector.c +29 -18
- data/ext/{defines.h → config.h} +0 -0
- data/ext/except.h +1 -1
- data/ext/extconf.rb +2 -1
- data/ext/fs_store.c +4 -2
- data/ext/global.h +1 -1
- data/ext/hash.c +15 -12
- data/ext/hash.h +1 -0
- data/ext/helper.c +2 -2
- data/ext/helper.h +1 -1
- data/ext/index.c +4 -2
- data/ext/index.h +2 -2
- data/ext/{mem_pool.c → mempool.c} +1 -1
- data/ext/{mem_pool.h → mempool.h} +0 -0
- data/ext/multimapper.c +310 -0
- data/ext/multimapper.h +51 -0
- data/ext/r_analysis.c +200 -22
- data/ext/r_search.c +125 -15
- data/ext/search.c +1 -1
- data/ext/sort.c +1 -1
- data/ext/stopwords.c +2 -3
- data/lib/ferret/index.rb +2 -1
- data/lib/ferret_version.rb +1 -1
- data/test/unit/analysis/tc_token_stream.rb +62 -0
- data/test/unit/index/tc_index.rb +19 -1
- data/test/unit/search/tc_search_and_sort.rb +1 -1
- data/test/unit/utils/tc_bit_vector.rb +7 -0
- metadata +9 -7
data/ext/search.c
CHANGED
data/ext/sort.c
CHANGED
@@ -538,7 +538,7 @@ SortField *sort_field_auto_new(char *field, bool reverse)
|
|
538
538
|
|
539
539
|
void *field_cache_get_index(IndexReader *ir, SortField *sf)
|
540
540
|
{
|
541
|
-
void *index = NULL;
|
541
|
+
void *volatile index = NULL;
|
542
542
|
int length = 0;
|
543
543
|
TermEnum *volatile te = NULL;
|
544
544
|
TermDocEnum *volatile tde = NULL;
|
data/ext/stopwords.c
CHANGED
@@ -10,10 +10,9 @@
|
|
10
10
|
|
11
11
|
const char *ENGLISH_STOP_WORDS[] = {
|
12
12
|
"a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in",
|
13
|
-
"into", "is", "it", "no", "not", "of", "on", "or", "
|
14
|
-
"that",
|
13
|
+
"into", "is", "it", "no", "not", "of", "on", "or", "such", "that",
|
15
14
|
"the", "their", "then", "there", "these", "they", "this", "to", "was",
|
16
|
-
"
|
15
|
+
"with", NULL
|
17
16
|
};
|
18
17
|
|
19
18
|
const char *FULL_ENGLISH_STOP_WORDS[] = {
|
data/lib/ferret/index.rb
CHANGED
@@ -527,6 +527,7 @@ module Ferret::Index
|
|
527
527
|
@searcher = nil
|
528
528
|
end
|
529
529
|
end
|
530
|
+
alias :commit :flush
|
530
531
|
|
531
532
|
# optimizes the index. This should only be called when the index will no
|
532
533
|
# longer be updated very often, but will be read a lot.
|
@@ -670,7 +671,7 @@ module Ferret::Index
|
|
670
671
|
latest = false
|
671
672
|
begin
|
672
673
|
latest = @reader.latest?
|
673
|
-
rescue
|
674
|
+
rescue LockError => le
|
674
675
|
sleep(@options[:lock_retry_time]) # sleep for 2 seconds and try again
|
675
676
|
latest = @reader.latest?
|
676
677
|
end
|
data/lib/ferret_version.rb
CHANGED
data/test/unit/analysis/tc_token_stream.rb
CHANGED
@@ -368,6 +368,68 @@ class RegExpTokenizerTest < Test::Unit::TestCase
|
|
368
368
|
end
|
369
369
|
end
|
370
370
|
|
371
|
+
class MappingFilterTest < Test::Unit::TestCase
|
372
|
+
include Ferret::Analysis
|
373
|
+
|
374
|
+
def test_mapping_filter()
|
375
|
+
mapping = {
|
376
|
+
['à','á','â','ã','ä','å','ā','ă'] => 'a',
|
377
|
+
'æ' => 'ae',
|
378
|
+
['ď','đ'] => 'd',
|
379
|
+
['ç','ć','č','ĉ','ċ'] => 'c',
|
380
|
+
['è','é','ê','ë','ē','ę','ě','ĕ','ė',] => 'e',
|
381
|
+
['ƒ'] => 'f',
|
382
|
+
['ĝ','ğ','ġ','ģ'] => 'g',
|
383
|
+
['ĥ','ħ'] => 'h',
|
384
|
+
['ì','ì','í','î','ï','ī','ĩ','ĭ'] => 'i',
|
385
|
+
['į','ı','ĳ','ĵ'] => 'j',
|
386
|
+
['ķ','ĸ'] => 'k',
|
387
|
+
['ł','ľ','ĺ','ļ','ŀ'] => 'l',
|
388
|
+
['ñ','ń','ň','ņ','ŉ','ŋ'] => 'n',
|
389
|
+
['ò','ó','ô','õ','ö','ø','ō','ő','ŏ','ŏ'] => 'o',
|
390
|
+
'œ' => 'oek',
|
391
|
+
'ą' => 'q',
|
392
|
+
['ŕ','ř','ŗ'] => 'r',
|
393
|
+
['ś','š','ş','ŝ','ș'] => 's',
|
394
|
+
['ť','ţ','ŧ','ț'] => 't',
|
395
|
+
['ù','ú','û','ü','ū','ů','ű','ŭ','ũ','ų'] => 'u',
|
396
|
+
'ŵ' => 'w',
|
397
|
+
['ý','ÿ','ŷ'] => 'y',
|
398
|
+
['ž','ż','ź'] => 'z'
|
399
|
+
}
|
400
|
+
input = <<END
|
401
|
+
aàáâãäåāăb cæd eďđf gçćčĉċh ièéêëēęěĕėj kƒl mĝğġģn oĥħp qììíîïīĩĭr sįıĳĵt uķĸv
|
402
|
+
włľĺļŀx yñńňņŉŋz aòóôõöøōőŏŏb cœd eąf gŕřŗh iśšşŝșj kťţŧțl mùúûüūůűŭũųn oŵp
|
403
|
+
qýÿŷr sžżźt
|
404
|
+
END
|
405
|
+
t = MappingFilter.new(LetterTokenizer.new(input), mapping)
|
406
|
+
assert_equal(Token.new('aaaaaaaaab', 0, 18), t.next)
|
407
|
+
assert_equal(Token.new('caed', 19, 23), t.next)
|
408
|
+
assert_equal(Token.new('eddf', 24, 30), t.next)
|
409
|
+
assert_equal(Token.new('gccccch', 31, 43), t.next)
|
410
|
+
assert_equal(Token.new('ieeeeeeeeej', 44, 64), t.next)
|
411
|
+
assert_equal(Token.new('kfl', 65, 69), t.next)
|
412
|
+
assert_equal(Token.new('mggggn', 70, 80), t.next)
|
413
|
+
assert_equal(Token.new('ohhp', 81, 87), t.next)
|
414
|
+
assert_equal(Token.new('qiiiiiiiir', 88, 106), t.next)
|
415
|
+
assert_equal(Token.new('sjjjjt', 107, 117), t.next)
|
416
|
+
assert_equal(Token.new('ukkv', 118, 124), t.next)
|
417
|
+
assert_equal(Token.new('wlllllx', 125, 137), t.next)
|
418
|
+
assert_equal(Token.new('ynnnnnnz', 138, 152), t.next)
|
419
|
+
assert_equal(Token.new('aoooooooooob', 153, 175), t.next)
|
420
|
+
assert_equal(Token.new('coekd', 176, 180), t.next)
|
421
|
+
assert_equal(Token.new('eqf', 181, 185), t.next)
|
422
|
+
assert_equal(Token.new('grrrh', 186, 194), t.next)
|
423
|
+
assert_equal(Token.new('isssssj', 195, 207), t.next)
|
424
|
+
assert_equal(Token.new('kttttl', 208, 218), t.next)
|
425
|
+
assert_equal(Token.new('muuuuuuuuuun', 219, 241), t.next)
|
426
|
+
assert_equal(Token.new('owp', 242, 246), t.next)
|
427
|
+
assert_equal(Token.new('qyyyr', 247, 255), t.next)
|
428
|
+
assert_equal(Token.new('szzzt', 256, 264), t.next)
|
429
|
+
assert(! t.next())
|
430
|
+
end
|
431
|
+
end if (/mswin/i !~ RUBY_PLATFORM)
|
432
|
+
|
371
433
|
class StopFilterTest < Test::Unit::TestCase
|
372
434
|
include Ferret::Analysis
|
373
435
|
|
data/test/unit/index/tc_index.rb
CHANGED
@@ -525,7 +525,7 @@ class IndexTest < Test::Unit::TestCase
|
|
525
525
|
{:content => "four", :date => "19390912"}
|
526
526
|
].each {|doc| index << doc}
|
527
527
|
|
528
|
-
sf_date = SortField.new("date", {:
|
528
|
+
sf_date = SortField.new("date", {:type => :integer})
|
529
529
|
#top_docs = index.search("one", :sort => [sf_date, SortField::SCORE])
|
530
530
|
top_docs = index.search("one", :sort => Sort.new("date"))
|
531
531
|
assert_equal(3, top_docs.total_hits)
|
@@ -773,5 +773,23 @@ class IndexTest < Test::Unit::TestCase
|
|
773
773
|
index.add_document({:content => "Content With Capitals"}, a)
|
774
774
|
tv = index.reader.term_vector(0, :content)
|
775
775
|
assert_equal("Capitals", tv.terms[0].text)
|
776
|
+
index.close
|
777
|
+
end
|
778
|
+
|
779
|
+
def test_top_doc_to_json
|
780
|
+
index = Ferret::I.new
|
781
|
+
[
|
782
|
+
{:f1 => "one"},
|
783
|
+
{:f2 => ["two",2,2.0]},
|
784
|
+
{:f3 => 3},
|
785
|
+
{:f4 => 4.0},
|
786
|
+
{:f5 => "five", :funny => '"' * 10_000}
|
787
|
+
].each {|doc| index << doc}
|
788
|
+
json_str = index.search("one two 3 4.0 five",
|
789
|
+
:sort => Ferret::Search::Sort::INDEX_ORDER).to_json
|
790
|
+
assert(json_str == '[{"f1":"one"},{"f2":["two","2","2.0"]},{"f3":"3"},{"f4":"4.0"},{"f5":"five","funny":"' + '\'"\'' * 10_000 + '"}]' ||
|
791
|
+
json_str == '[{"f1":"one"},{"f2":["two","2","2.0"]},{"f3":"3"},{"f4":"4.0"},{"funny":"' + '\'"\'' * 10_000 + '","f5":"five"}]')
|
792
|
+
assert_equal('[]', index.search("xxx").to_json)
|
793
|
+
index.close
|
776
794
|
end
|
777
795
|
end
|
data/test/unit/search/tc_search_and_sort.rb
CHANGED
@@ -128,7 +128,7 @@ class SearchAndSortTest < Test::Unit::TestCase
|
|
128
128
|
|
129
129
|
## byte
|
130
130
|
do_test_top_docs(is, q, [3,2,7,4,8,5,9,1,6,0],
|
131
|
-
|
131
|
+
SortField.new(:int, :type => :byte))
|
132
132
|
do_test_top_docs(is, q, [0,1,6,5,9,4,8,2,7,3],
|
133
133
|
[SortField.new(:int, :type => :byte, :reverse => true)])
|
134
134
|
|
data/test/unit/utils/tc_bit_vector.rb
CHANGED
@@ -133,6 +133,13 @@ class BitVectorTest < Test::Unit::TestCase
|
|
133
133
|
|
134
134
|
assert_equal(bv2, and_bv, "and_bv should be empty")
|
135
135
|
assert_equal(0, and_bv.count)
|
136
|
+
|
137
|
+
bv1 = BitVector.new
|
138
|
+
bv2 = BitVector.new.not!
|
139
|
+
bv1.set(10)
|
140
|
+
bv1.set(11)
|
141
|
+
bv1.set(20)
|
142
|
+
assert_equal(bv1, bv1 & bv2, "bv anded with empty not bv should be same")
|
136
143
|
end
|
137
144
|
|
138
145
|
def test_bv_or
|
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
|
|
3
3
|
specification_version: 1
|
4
4
|
name: ferret
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.10.11
|
7
|
-
date: 2006-10-
|
6
|
+
version: 0.10.12
|
7
|
+
date: 2006-10-20 00:00:00 +09:00
|
8
8
|
summary: Ruby indexing library.
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -55,7 +55,6 @@ files:
|
|
55
55
|
- ext/stopwords.c
|
56
56
|
- ext/array.c
|
57
57
|
- ext/index.c
|
58
|
-
- ext/mem_pool.c
|
59
58
|
- ext/compound_io.c
|
60
59
|
- ext/q_prefix.c
|
61
60
|
- ext/q_range.c
|
@@ -72,8 +71,10 @@ files:
|
|
72
71
|
- ext/q_parser.c
|
73
72
|
- ext/q_span.c
|
74
73
|
- ext/term_vectors.c
|
74
|
+
- ext/multimapper.c
|
75
|
+
- ext/mempool.c
|
75
76
|
- ext/priorityqueue.h
|
76
|
-
- ext/defines.h
|
77
|
+
- ext/mempool.h
|
77
78
|
- ext/posh.h
|
78
79
|
- ext/store.h
|
79
80
|
- ext/hashset.h
|
@@ -89,9 +90,10 @@ files:
|
|
89
90
|
- ext/win32.h
|
90
91
|
- ext/analysis.h
|
91
92
|
- ext/search.h
|
92
|
-
- ext/mem_pool.h
|
93
93
|
- ext/array.h
|
94
94
|
- ext/lang.h
|
95
|
+
- ext/config.h
|
96
|
+
- ext/multimapper.h
|
95
97
|
- ext/stem_UTF_8_norwegian.c
|
96
98
|
- ext/stem_UTF_8_danish.c
|
97
99
|
- ext/stem_UTF_8_dutch.c
|
@@ -149,10 +151,10 @@ files:
|
|
149
151
|
- ext/api.h
|
150
152
|
- ext/header.h
|
151
153
|
- ext/libstemmer.c
|
152
|
-
- ext/modules.h
|
153
|
-
- ext/libstemmer.h
|
154
154
|
- ext/ferret.h
|
155
155
|
- ext/ferret.c
|
156
|
+
- ext/modules.h
|
157
|
+
- ext/libstemmer.h
|
156
158
|
- ext/r_analysis.c
|
157
159
|
- ext/r_utils.c
|
158
160
|
- ext/r_store.c
|