ferret 0.10.11 → 0.10.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +13 -0
- data/Rakefile +1 -1
- data/ext/analysis.c +62 -11
- data/ext/analysis.h +11 -0
- data/ext/bitvector.c +29 -18
- data/ext/{defines.h → config.h} +0 -0
- data/ext/except.h +1 -1
- data/ext/extconf.rb +2 -1
- data/ext/fs_store.c +4 -2
- data/ext/global.h +1 -1
- data/ext/hash.c +15 -12
- data/ext/hash.h +1 -0
- data/ext/helper.c +2 -2
- data/ext/helper.h +1 -1
- data/ext/index.c +4 -2
- data/ext/index.h +2 -2
- data/ext/{mem_pool.c → mempool.c} +1 -1
- data/ext/{mem_pool.h → mempool.h} +0 -0
- data/ext/multimapper.c +310 -0
- data/ext/multimapper.h +51 -0
- data/ext/r_analysis.c +200 -22
- data/ext/r_search.c +125 -15
- data/ext/search.c +1 -1
- data/ext/sort.c +1 -1
- data/ext/stopwords.c +2 -3
- data/lib/ferret/index.rb +2 -1
- data/lib/ferret_version.rb +1 -1
- data/test/unit/analysis/tc_token_stream.rb +62 -0
- data/test/unit/index/tc_index.rb +19 -1
- data/test/unit/search/tc_search_and_sort.rb +1 -1
- data/test/unit/utils/tc_bit_vector.rb +7 -0
- metadata +9 -7
data/ext/search.c
CHANGED
data/ext/sort.c
CHANGED
@@ -538,7 +538,7 @@ SortField *sort_field_auto_new(char *field, bool reverse)
|
|
538
538
|
|
539
539
|
void *field_cache_get_index(IndexReader *ir, SortField *sf)
|
540
540
|
{
|
541
|
-
void *index = NULL;
|
541
|
+
void *volatile index = NULL;
|
542
542
|
int length = 0;
|
543
543
|
TermEnum *volatile te = NULL;
|
544
544
|
TermDocEnum *volatile tde = NULL;
|
data/ext/stopwords.c
CHANGED
@@ -10,10 +10,9 @@
|
|
10
10
|
|
11
11
|
const char *ENGLISH_STOP_WORDS[] = {
|
12
12
|
"a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in",
|
13
|
-
"into", "is", "it", "no", "not", "of", "on", "or", "
|
14
|
-
"that",
|
13
|
+
"into", "is", "it", "no", "not", "of", "on", "or", "such", "that",
|
15
14
|
"the", "their", "then", "there", "these", "they", "this", "to", "was",
|
16
|
-
"
|
15
|
+
"with", NULL
|
17
16
|
};
|
18
17
|
|
19
18
|
const char *FULL_ENGLISH_STOP_WORDS[] = {
|
data/lib/ferret/index.rb
CHANGED
@@ -527,6 +527,7 @@ module Ferret::Index
|
|
527
527
|
@searcher = nil
|
528
528
|
end
|
529
529
|
end
|
530
|
+
alias :commit :flush
|
530
531
|
|
531
532
|
# optimizes the index. This should only be called when the index will no
|
532
533
|
# longer be updated very often, but will be read a lot.
|
@@ -670,7 +671,7 @@ module Ferret::Index
|
|
670
671
|
latest = false
|
671
672
|
begin
|
672
673
|
latest = @reader.latest?
|
673
|
-
rescue
|
674
|
+
rescue LockError => le
|
674
675
|
sleep(@options[:lock_retry_time]) # sleep for 2 seconds and try again
|
675
676
|
latest = @reader.latest?
|
676
677
|
end
|
data/lib/ferret_version.rb
CHANGED
data/test/unit/analysis/tc_token_stream.rb
CHANGED
@@ -368,6 +368,68 @@ class RegExpTokenizerTest < Test::Unit::TestCase
|
|
368
368
|
end
|
369
369
|
end
|
370
370
|
|
371
|
+
class MappingFilterTest < Test::Unit::TestCase
|
372
|
+
include Ferret::Analysis
|
373
|
+
|
374
|
+
def test_mapping_filter()
|
375
|
+
mapping = {
|
376
|
+
['à','á','â','ã','ä','å','ā','ă'] => 'a',
|
377
|
+
'æ' => 'ae',
|
378
|
+
['ď','đ'] => 'd',
|
379
|
+
['ç','ć','č','ĉ','ċ'] => 'c',
|
380
|
+
['è','é','ê','ë','ē','ę','ě','ĕ','ė',] => 'e',
|
381
|
+
['ƒ'] => 'f',
|
382
|
+
['ĝ','ğ','ġ','ģ'] => 'g',
|
383
|
+
['ĥ','ħ'] => 'h',
|
384
|
+
['ì','ì','í','î','ï','ī','ĩ','ĭ'] => 'i',
|
385
|
+
['į','ı','ij','ĵ'] => 'j',
|
386
|
+
['ķ','ĸ'] => 'k',
|
387
|
+
['ł','ľ','ĺ','ļ','ŀ'] => 'l',
|
388
|
+
['ñ','ń','ň','ņ','ʼn','ŋ'] => 'n',
|
389
|
+
['ò','ó','ô','õ','ö','ø','ō','ő','ŏ','ŏ'] => 'o',
|
390
|
+
'œ' => 'oek',
|
391
|
+
'ą' => 'q',
|
392
|
+
['ŕ','ř','ŗ'] => 'r',
|
393
|
+
['ś','š','ş','ŝ','ș'] => 's',
|
394
|
+
['ť','ţ','ŧ','ț'] => 't',
|
395
|
+
['ù','ú','û','ü','ū','ů','ű','ŭ','ũ','ų'] => 'u',
|
396
|
+
'ŵ' => 'w',
|
397
|
+
['ý','ÿ','ŷ'] => 'y',
|
398
|
+
['ž','ż','ź'] => 'z'
|
399
|
+
}
|
400
|
+
input = <<END
|
401
|
+
aàáâãäåāăb cæd eďđf gçćčĉċh ièéêëēęěĕėj kƒl mĝğġģn oĥħp qììíîïīĩĭr sįıijĵt uķĸv
|
402
|
+
włľĺļŀx yñńňņʼnŋz aòóôõöøōőŏŏb cœd eąf gŕřŗh iśšşŝșj kťţŧțl mùúûüūůűŭũųn oŵp
|
403
|
+
qýÿŷr sžżźt
|
404
|
+
END
|
405
|
+
t = MappingFilter.new(LetterTokenizer.new(input), mapping)
|
406
|
+
assert_equal(Token.new('aaaaaaaaab', 0, 18), t.next)
|
407
|
+
assert_equal(Token.new('caed', 19, 23), t.next)
|
408
|
+
assert_equal(Token.new('eddf', 24, 30), t.next)
|
409
|
+
assert_equal(Token.new('gccccch', 31, 43), t.next)
|
410
|
+
assert_equal(Token.new('ieeeeeeeeej', 44, 64), t.next)
|
411
|
+
assert_equal(Token.new('kfl', 65, 69), t.next)
|
412
|
+
assert_equal(Token.new('mggggn', 70, 80), t.next)
|
413
|
+
assert_equal(Token.new('ohhp', 81, 87), t.next)
|
414
|
+
assert_equal(Token.new('qiiiiiiiir', 88, 106), t.next)
|
415
|
+
assert_equal(Token.new('sjjjjt', 107, 117), t.next)
|
416
|
+
assert_equal(Token.new('ukkv', 118, 124), t.next)
|
417
|
+
assert_equal(Token.new('wlllllx', 125, 137), t.next)
|
418
|
+
assert_equal(Token.new('ynnnnnnz', 138, 152), t.next)
|
419
|
+
assert_equal(Token.new('aoooooooooob', 153, 175), t.next)
|
420
|
+
assert_equal(Token.new('coekd', 176, 180), t.next)
|
421
|
+
assert_equal(Token.new('eqf', 181, 185), t.next)
|
422
|
+
assert_equal(Token.new('grrrh', 186, 194), t.next)
|
423
|
+
assert_equal(Token.new('isssssj', 195, 207), t.next)
|
424
|
+
assert_equal(Token.new('kttttl', 208, 218), t.next)
|
425
|
+
assert_equal(Token.new('muuuuuuuuuun', 219, 241), t.next)
|
426
|
+
assert_equal(Token.new('owp', 242, 246), t.next)
|
427
|
+
assert_equal(Token.new('qyyyr', 247, 255), t.next)
|
428
|
+
assert_equal(Token.new('szzzt', 256, 264), t.next)
|
429
|
+
assert(! t.next())
|
430
|
+
end
|
431
|
+
end if (/mswin/i !~ RUBY_PLATFORM)
|
432
|
+
|
371
433
|
class StopFilterTest < Test::Unit::TestCase
|
372
434
|
include Ferret::Analysis
|
373
435
|
|
data/test/unit/index/tc_index.rb
CHANGED
@@ -525,7 +525,7 @@ class IndexTest < Test::Unit::TestCase
|
|
525
525
|
{:content => "four", :date => "19390912"}
|
526
526
|
].each {|doc| index << doc}
|
527
527
|
|
528
|
-
sf_date = SortField.new("date", {:
|
528
|
+
sf_date = SortField.new("date", {:type => :integer})
|
529
529
|
#top_docs = index.search("one", :sort => [sf_date, SortField::SCORE])
|
530
530
|
top_docs = index.search("one", :sort => Sort.new("date"))
|
531
531
|
assert_equal(3, top_docs.total_hits)
|
@@ -773,5 +773,23 @@ class IndexTest < Test::Unit::TestCase
|
|
773
773
|
index.add_document({:content => "Content With Capitals"}, a)
|
774
774
|
tv = index.reader.term_vector(0, :content)
|
775
775
|
assert_equal("Capitals", tv.terms[0].text)
|
776
|
+
index.close
|
777
|
+
end
|
778
|
+
|
779
|
+
def test_top_doc_to_json
|
780
|
+
index = Ferret::I.new
|
781
|
+
[
|
782
|
+
{:f1 => "one"},
|
783
|
+
{:f2 => ["two",2,2.0]},
|
784
|
+
{:f3 => 3},
|
785
|
+
{:f4 => 4.0},
|
786
|
+
{:f5 => "five", :funny => '"' * 10_000}
|
787
|
+
].each {|doc| index << doc}
|
788
|
+
json_str = index.search("one two 3 4.0 five",
|
789
|
+
:sort => Ferret::Search::Sort::INDEX_ORDER).to_json
|
790
|
+
assert(json_str == '[{"f1":"one"},{"f2":["two","2","2.0"]},{"f3":"3"},{"f4":"4.0"},{"f5":"five","funny":"' + '\'"\'' * 10_000 + '"}]' ||
|
791
|
+
json_str == '[{"f1":"one"},{"f2":["two","2","2.0"]},{"f3":"3"},{"f4":"4.0"},{"funny":"' + '\'"\'' * 10_000 + '","f5":"five"}]')
|
792
|
+
assert_equal('[]', index.search("xxx").to_json)
|
793
|
+
index.close
|
776
794
|
end
|
777
795
|
end
|
data/test/unit/search/tc_search_and_sort.rb
CHANGED
@@ -128,7 +128,7 @@ class SearchAndSortTest < Test::Unit::TestCase
|
|
128
128
|
|
129
129
|
## byte
|
130
130
|
do_test_top_docs(is, q, [3,2,7,4,8,5,9,1,6,0],
|
131
|
-
|
131
|
+
SortField.new(:int, :type => :byte))
|
132
132
|
do_test_top_docs(is, q, [0,1,6,5,9,4,8,2,7,3],
|
133
133
|
[SortField.new(:int, :type => :byte, :reverse => true)])
|
134
134
|
|
data/test/unit/utils/tc_bit_vector.rb
CHANGED
@@ -133,6 +133,13 @@ class BitVectorTest < Test::Unit::TestCase
|
|
133
133
|
|
134
134
|
assert_equal(bv2, and_bv, "and_bv should be empty")
|
135
135
|
assert_equal(0, and_bv.count)
|
136
|
+
|
137
|
+
bv1 = BitVector.new
|
138
|
+
bv2 = BitVector.new.not!
|
139
|
+
bv1.set(10)
|
140
|
+
bv1.set(11)
|
141
|
+
bv1.set(20)
|
142
|
+
assert_equal(bv1, bv1 & bv2, "bv anded with empty not bv should be same")
|
136
143
|
end
|
137
144
|
|
138
145
|
def test_bv_or
|
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
|
|
3
3
|
specification_version: 1
|
4
4
|
name: ferret
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.10.11
|
7
|
-
date: 2006-10-
|
6
|
+
version: 0.10.12
|
7
|
+
date: 2006-10-20 00:00:00 +09:00
|
8
8
|
summary: Ruby indexing library.
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -55,7 +55,6 @@ files:
|
|
55
55
|
- ext/stopwords.c
|
56
56
|
- ext/array.c
|
57
57
|
- ext/index.c
|
58
|
-
- ext/mem_pool.c
|
59
58
|
- ext/compound_io.c
|
60
59
|
- ext/q_prefix.c
|
61
60
|
- ext/q_range.c
|
@@ -72,8 +71,10 @@ files:
|
|
72
71
|
- ext/q_parser.c
|
73
72
|
- ext/q_span.c
|
74
73
|
- ext/term_vectors.c
|
74
|
+
- ext/multimapper.c
|
75
|
+
- ext/mempool.c
|
75
76
|
- ext/priorityqueue.h
|
76
|
-
- ext/defines.h
|
77
|
+
- ext/mempool.h
|
77
78
|
- ext/posh.h
|
78
79
|
- ext/store.h
|
79
80
|
- ext/hashset.h
|
@@ -89,9 +90,10 @@ files:
|
|
89
90
|
- ext/win32.h
|
90
91
|
- ext/analysis.h
|
91
92
|
- ext/search.h
|
92
|
-
- ext/mem_pool.h
|
93
93
|
- ext/array.h
|
94
94
|
- ext/lang.h
|
95
|
+
- ext/config.h
|
96
|
+
- ext/multimapper.h
|
95
97
|
- ext/stem_UTF_8_norwegian.c
|
96
98
|
- ext/stem_UTF_8_danish.c
|
97
99
|
- ext/stem_UTF_8_dutch.c
|
@@ -149,10 +151,10 @@ files:
|
|
149
151
|
- ext/api.h
|
150
152
|
- ext/header.h
|
151
153
|
- ext/libstemmer.c
|
152
|
-
- ext/modules.h
|
153
|
-
- ext/libstemmer.h
|
154
154
|
- ext/ferret.h
|
155
155
|
- ext/ferret.c
|
156
|
+
- ext/modules.h
|
157
|
+
- ext/libstemmer.h
|
156
158
|
- ext/r_analysis.c
|
157
159
|
- ext/r_utils.c
|
158
160
|
- ext/r_store.c
|