ferret 0.11.6 → 0.11.8.4
Sign up to get free protection for your applications and to get access to all the features.
- data/README +10 -22
- data/RELEASE_CHANGES +137 -0
- data/RELEASE_NOTES +60 -0
- data/Rakefile +379 -274
- data/TODO +100 -8
- data/bin/ferret-browser +0 -0
- data/ext/BZLIB_blocksort.c +1094 -0
- data/ext/BZLIB_bzlib.c +1578 -0
- data/ext/BZLIB_compress.c +672 -0
- data/ext/BZLIB_crctable.c +104 -0
- data/ext/BZLIB_decompress.c +626 -0
- data/ext/BZLIB_huffman.c +205 -0
- data/ext/BZLIB_randtable.c +84 -0
- data/ext/{api.c → STEMMER_api.c} +7 -10
- data/ext/{libstemmer.c → STEMMER_libstemmer.c} +3 -2
- data/ext/{stem_ISO_8859_1_danish.c → STEMMER_stem_ISO_8859_1_danish.c} +123 -124
- data/ext/{stem_ISO_8859_1_dutch.c → STEMMER_stem_ISO_8859_1_dutch.c} +177 -188
- data/ext/STEMMER_stem_ISO_8859_1_english.c +1117 -0
- data/ext/{stem_ISO_8859_1_finnish.c → STEMMER_stem_ISO_8859_1_finnish.c} +276 -306
- data/ext/STEMMER_stem_ISO_8859_1_french.c +1246 -0
- data/ext/{stem_ISO_8859_1_german.c → STEMMER_stem_ISO_8859_1_german.c} +161 -170
- data/ext/STEMMER_stem_ISO_8859_1_hungarian.c +1230 -0
- data/ext/STEMMER_stem_ISO_8859_1_italian.c +1065 -0
- data/ext/STEMMER_stem_ISO_8859_1_norwegian.c +297 -0
- data/ext/{stem_ISO_8859_1_porter.c → STEMMER_stem_ISO_8859_1_porter.c} +263 -290
- data/ext/{stem_ISO_8859_1_portuguese.c → STEMMER_stem_ISO_8859_1_portuguese.c} +362 -380
- data/ext/STEMMER_stem_ISO_8859_1_spanish.c +1093 -0
- data/ext/STEMMER_stem_ISO_8859_1_swedish.c +307 -0
- data/ext/STEMMER_stem_ISO_8859_2_romanian.c +998 -0
- data/ext/{stem_KOI8_R_russian.c → STEMMER_stem_KOI8_R_russian.c} +244 -245
- data/ext/STEMMER_stem_UTF_8_danish.c +339 -0
- data/ext/{stem_UTF_8_dutch.c → STEMMER_stem_UTF_8_dutch.c} +192 -211
- data/ext/STEMMER_stem_UTF_8_english.c +1125 -0
- data/ext/{stem_UTF_8_finnish.c → STEMMER_stem_UTF_8_finnish.c} +284 -324
- data/ext/STEMMER_stem_UTF_8_french.c +1256 -0
- data/ext/{stem_UTF_8_german.c → STEMMER_stem_UTF_8_german.c} +170 -187
- data/ext/STEMMER_stem_UTF_8_hungarian.c +1234 -0
- data/ext/STEMMER_stem_UTF_8_italian.c +1073 -0
- data/ext/STEMMER_stem_UTF_8_norwegian.c +299 -0
- data/ext/{stem_UTF_8_porter.c → STEMMER_stem_UTF_8_porter.c} +271 -310
- data/ext/STEMMER_stem_UTF_8_portuguese.c +1023 -0
- data/ext/STEMMER_stem_UTF_8_romanian.c +1004 -0
- data/ext/STEMMER_stem_UTF_8_russian.c +694 -0
- data/ext/STEMMER_stem_UTF_8_spanish.c +1097 -0
- data/ext/STEMMER_stem_UTF_8_swedish.c +309 -0
- data/ext/STEMMER_stem_UTF_8_turkish.c +2205 -0
- data/ext/{utilities.c → STEMMER_utilities.c} +100 -68
- data/ext/analysis.c +276 -121
- data/ext/analysis.h +190 -143
- data/ext/api.h +3 -4
- data/ext/array.c +5 -3
- data/ext/array.h +52 -43
- data/ext/bitvector.c +38 -482
- data/ext/bitvector.h +446 -124
- data/ext/bzlib.h +282 -0
- data/ext/bzlib_private.h +503 -0
- data/ext/compound_io.c +23 -22
- data/ext/config.h +21 -11
- data/ext/document.c +43 -40
- data/ext/document.h +31 -21
- data/ext/except.c +20 -38
- data/ext/except.h +89 -76
- data/ext/extconf.rb +3 -2
- data/ext/ferret.c +49 -35
- data/ext/ferret.h +14 -11
- data/ext/field_index.c +262 -0
- data/ext/field_index.h +52 -0
- data/ext/filter.c +11 -10
- data/ext/fs_store.c +65 -47
- data/ext/global.c +245 -165
- data/ext/global.h +252 -54
- data/ext/hash.c +200 -243
- data/ext/hash.h +205 -163
- data/ext/hashset.c +118 -96
- data/ext/hashset.h +110 -82
- data/ext/header.h +19 -19
- data/ext/helper.c +11 -10
- data/ext/helper.h +14 -6
- data/ext/index.c +745 -366
- data/ext/index.h +503 -529
- data/ext/internal.h +1020 -0
- data/ext/lang.c +10 -0
- data/ext/lang.h +35 -15
- data/ext/mempool.c +5 -4
- data/ext/mempool.h +30 -22
- data/ext/modules.h +35 -7
- data/ext/multimapper.c +43 -2
- data/ext/multimapper.h +32 -23
- data/ext/posh.c +0 -0
- data/ext/posh.h +4 -38
- data/ext/priorityqueue.c +10 -12
- data/ext/priorityqueue.h +33 -21
- data/ext/q_boolean.c +22 -9
- data/ext/q_const_score.c +3 -2
- data/ext/q_filtered_query.c +15 -12
- data/ext/q_fuzzy.c +147 -135
- data/ext/q_match_all.c +3 -2
- data/ext/q_multi_term.c +28 -32
- data/ext/q_parser.c +451 -173
- data/ext/q_phrase.c +158 -79
- data/ext/q_prefix.c +16 -18
- data/ext/q_range.c +363 -31
- data/ext/q_span.c +130 -141
- data/ext/q_term.c +21 -21
- data/ext/q_wildcard.c +19 -23
- data/ext/r_analysis.c +369 -242
- data/ext/r_index.c +421 -434
- data/ext/r_qparser.c +142 -92
- data/ext/r_search.c +790 -407
- data/ext/r_store.c +44 -44
- data/ext/r_utils.c +264 -96
- data/ext/ram_store.c +29 -23
- data/ext/scanner.c +895 -0
- data/ext/scanner.h +36 -0
- data/ext/scanner_mb.c +6701 -0
- data/ext/scanner_utf8.c +4415 -0
- data/ext/search.c +210 -87
- data/ext/search.h +556 -488
- data/ext/similarity.c +17 -16
- data/ext/similarity.h +51 -44
- data/ext/sort.c +157 -354
- data/ext/stem_ISO_8859_1_hungarian.h +16 -0
- data/ext/stem_ISO_8859_2_romanian.h +16 -0
- data/ext/stem_UTF_8_hungarian.h +16 -0
- data/ext/stem_UTF_8_romanian.h +16 -0
- data/ext/stem_UTF_8_turkish.h +16 -0
- data/ext/stopwords.c +287 -278
- data/ext/store.c +57 -51
- data/ext/store.h +308 -286
- data/ext/symbol.c +10 -0
- data/ext/symbol.h +23 -0
- data/ext/term_vectors.c +14 -293
- data/ext/threading.h +22 -22
- data/ext/win32.h +12 -4
- data/lib/ferret.rb +2 -1
- data/lib/ferret/browser.rb +1 -1
- data/lib/ferret/field_symbol.rb +94 -0
- data/lib/ferret/index.rb +221 -34
- data/lib/ferret/number_tools.rb +6 -6
- data/lib/ferret/version.rb +3 -0
- data/test/{unit → long_running}/largefile/tc_largefile.rb +1 -1
- data/test/test_helper.rb +7 -2
- data/test/test_installed.rb +1 -0
- data/test/threading/thread_safety_index_test.rb +10 -1
- data/test/threading/thread_safety_read_write_test.rb +4 -7
- data/test/threading/thread_safety_test.rb +0 -0
- data/test/unit/analysis/tc_analyzer.rb +29 -27
- data/test/unit/analysis/tc_token_stream.rb +23 -16
- data/test/unit/index/tc_index.rb +116 -11
- data/test/unit/index/tc_index_reader.rb +27 -27
- data/test/unit/index/tc_index_writer.rb +10 -0
- data/test/unit/index/th_doc.rb +38 -21
- data/test/unit/search/tc_filter.rb +31 -10
- data/test/unit/search/tc_index_searcher.rb +6 -0
- data/test/unit/search/tm_searcher.rb +53 -1
- data/test/unit/store/tc_fs_store.rb +40 -2
- data/test/unit/store/tc_ram_store.rb +0 -0
- data/test/unit/store/tm_store.rb +0 -0
- data/test/unit/store/tm_store_lock.rb +7 -6
- data/test/unit/tc_field_symbol.rb +26 -0
- data/test/unit/ts_analysis.rb +0 -0
- data/test/unit/ts_index.rb +0 -0
- data/test/unit/ts_store.rb +0 -0
- data/test/unit/ts_utils.rb +0 -0
- data/test/unit/utils/tc_number_tools.rb +0 -0
- data/test/utils/content_generator.rb +226 -0
- metadata +262 -221
- data/ext/inc/lang.h +0 -48
- data/ext/inc/threading.h +0 -31
- data/ext/stem_ISO_8859_1_english.c +0 -1156
- data/ext/stem_ISO_8859_1_french.c +0 -1276
- data/ext/stem_ISO_8859_1_italian.c +0 -1091
- data/ext/stem_ISO_8859_1_norwegian.c +0 -296
- data/ext/stem_ISO_8859_1_spanish.c +0 -1119
- data/ext/stem_ISO_8859_1_swedish.c +0 -307
- data/ext/stem_UTF_8_danish.c +0 -344
- data/ext/stem_UTF_8_english.c +0 -1176
- data/ext/stem_UTF_8_french.c +0 -1296
- data/ext/stem_UTF_8_italian.c +0 -1113
- data/ext/stem_UTF_8_norwegian.c +0 -302
- data/ext/stem_UTF_8_portuguese.c +0 -1055
- data/ext/stem_UTF_8_russian.c +0 -709
- data/ext/stem_UTF_8_spanish.c +0 -1137
- data/ext/stem_UTF_8_swedish.c +0 -313
- data/lib/ferret_version.rb +0 -3
data/test/unit/index/tc_index.rb
CHANGED
@@ -353,7 +353,7 @@ class IndexTest < Test::Unit::TestCase
|
|
353
353
|
assert_equal(0, top_docs.hits.size)
|
354
354
|
|
355
355
|
iw = IndexWriter.new(:path => fs_path, :analyzer => WhiteSpaceAnalyzer.new)
|
356
|
-
iw << {:f
|
356
|
+
iw << {:f => "content3"}
|
357
357
|
iw.close()
|
358
358
|
|
359
359
|
top_docs = index.search("content3")
|
@@ -462,6 +462,95 @@ class IndexTest < Test::Unit::TestCase
|
|
462
462
|
index.close
|
463
463
|
end
|
464
464
|
|
465
|
+
def test_index_key_batch0
|
466
|
+
data = {
|
467
|
+
"0" => {:id => "0", :val => "one"},
|
468
|
+
"0" => {:id => "0", :val => "two"},
|
469
|
+
"1" =>{:id => "1", :val => "three"},
|
470
|
+
"1" => {:id => "1", :val => "four"},
|
471
|
+
}
|
472
|
+
|
473
|
+
index = Index.new(:analyzer => WhiteSpaceAnalyzer.new,
|
474
|
+
:key => :id)
|
475
|
+
index.batch_update data
|
476
|
+
assert_equal(2, index.size)
|
477
|
+
index.close
|
478
|
+
end
|
479
|
+
|
480
|
+
def test_index_key_batch1
|
481
|
+
data0 = {
|
482
|
+
"0" => {:id => "0", :val => "one"},
|
483
|
+
"0" => {:id => "0", :val => "two"},
|
484
|
+
"1" =>{:id => "1", :val => "three"},
|
485
|
+
"2" => {:id => "1", :val => "four"},
|
486
|
+
}
|
487
|
+
|
488
|
+
data1 = {
|
489
|
+
"0" => {:id => "0", :val => "one"},
|
490
|
+
"3" => {:id => "3", :val => "two"},
|
491
|
+
"2" =>{:id => "2", :val => "three"},
|
492
|
+
"1" => {:id => "1", :val => "four"},
|
493
|
+
"4" => {:id => "4", :val => "four"},
|
494
|
+
}
|
495
|
+
|
496
|
+
index = Index.new(:analyzer => WhiteSpaceAnalyzer.new,
|
497
|
+
:key => :id)
|
498
|
+
index.batch_update data0
|
499
|
+
assert_equal(3, index.size)
|
500
|
+
index.batch_update data1
|
501
|
+
assert_equal(5, index.size)
|
502
|
+
index.close
|
503
|
+
end
|
504
|
+
|
505
|
+
def test_index_key_delete_batch0
|
506
|
+
data0 = {
|
507
|
+
"0" => {:id => "0", :val => "one"},
|
508
|
+
"0" => {:id => "0", :val => "two"},
|
509
|
+
"1" =>{:id => "1", :val => "three"},
|
510
|
+
"2" => {:id => "2", :val => "four"},
|
511
|
+
"0" => {:id => "0", :val => "four"},
|
512
|
+
}
|
513
|
+
|
514
|
+
data1 = ["0", "1"];
|
515
|
+
|
516
|
+
index = Index.new(:analyzer => WhiteSpaceAnalyzer.new, :key => :id)
|
517
|
+
index.batch_update data0
|
518
|
+
|
519
|
+
assert_equal("four", index["0"][:val])
|
520
|
+
assert_equal("three", index["1"][:val])
|
521
|
+
assert_equal("four", index["2"][:val])
|
522
|
+
|
523
|
+
assert_equal(3, index.size)
|
524
|
+
index.delete data1
|
525
|
+
assert_equal(1, index.size)
|
526
|
+
assert_equal("four", index["2"][:val])
|
527
|
+
|
528
|
+
index.close
|
529
|
+
end
|
530
|
+
|
531
|
+
def test_index_key_delete_batch0
|
532
|
+
index = Index.new(:analyzer => WhiteSpaceAnalyzer.new)
|
533
|
+
1000.times {|i| index << {:id => "#{i}", :content => "content #{i}"}}
|
534
|
+
assert_equal(1000, index.size)
|
535
|
+
assert_equal("content 876", index['876'][:content])
|
536
|
+
|
537
|
+
new_docs = Array.new(1000) {|i| {:id => i, :content => "#{i} > content"}}
|
538
|
+
index.batch_update(new_docs)
|
539
|
+
assert_equal(1000, index.size)
|
540
|
+
assert_equal("128 > content", index['128'][:content])
|
541
|
+
|
542
|
+
new_docs = Array.new(1000) {|i| {:id => i.to_s, :content => "_(#{i})_"}}
|
543
|
+
index.batch_update(new_docs)
|
544
|
+
assert_equal(1000, index.size)
|
545
|
+
assert_equal("_(287)_", index['287'][:content])
|
546
|
+
|
547
|
+
new_docs = {}
|
548
|
+
1000.times {|i| new_docs[i.to_s] = {:id => i, :content => "Hash(#{i})"}}
|
549
|
+
index.batch_update(new_docs)
|
550
|
+
assert_equal(1000, index.size)
|
551
|
+
assert_equal("Hash(78)", index['78'][:content])
|
552
|
+
end
|
553
|
+
|
465
554
|
def test_index_multi_key
|
466
555
|
index = Index.new(:analyzer => WhiteSpaceAnalyzer.new,
|
467
556
|
:key => [:id, :table])
|
@@ -555,19 +644,23 @@ class IndexTest < Test::Unit::TestCase
|
|
555
644
|
|
556
645
|
index.close
|
557
646
|
end
|
558
|
-
|
647
|
+
|
648
|
+
# this test has been corrected to work as intended
|
649
|
+
# it now fails the same way on both 1.8 and 1.9 -- sds
|
559
650
|
def test_auto_flush
|
560
651
|
fs_path = File.expand_path(File.join(File.dirname(__FILE__), '../../temp/fsdir'))
|
561
652
|
Dir[File.join(fs_path, "*")].each {|path| begin File.delete(path) rescue nil end}
|
562
653
|
|
563
|
-
data = %
|
654
|
+
data = %w(one two three four five six seven eight nine ten eleven twelve)
|
564
655
|
index1 = Index.new(:path => fs_path, :auto_flush => true, :key => :id)
|
565
|
-
index1 << "zero"
|
656
|
+
index1 << {:id => 0, :content => "zero"}
|
566
657
|
index2 = Index.new(:path => fs_path, :auto_flush => true)
|
567
658
|
begin
|
659
|
+
n = 1
|
568
660
|
data.each do |datum|
|
569
|
-
index1 << {:id =>
|
570
|
-
index2 << {:id =>
|
661
|
+
index1 << {:id => n, :content => datum}
|
662
|
+
index2 << {:id => n, :content => datum}
|
663
|
+
n += 1
|
571
664
|
end
|
572
665
|
5.times do |i|
|
573
666
|
index1.delete(i)
|
@@ -593,9 +686,9 @@ class IndexTest < Test::Unit::TestCase
|
|
593
686
|
|
594
687
|
# Note: Adding keywords to either field1 or field2 gets rid of the error
|
595
688
|
|
596
|
-
index << {:field1
|
597
|
-
index << {:field2
|
598
|
-
index << {:field3
|
689
|
+
index << {:field1 => ''}
|
690
|
+
index << {:field2 => ''}
|
691
|
+
index << {:field3 => 'foo bar baz'}
|
599
692
|
|
600
693
|
index.flush
|
601
694
|
index.close
|
@@ -644,7 +737,7 @@ class IndexTest < Test::Unit::TestCase
|
|
644
737
|
end
|
645
738
|
|
646
739
|
def test_wildcard
|
647
|
-
|
740
|
+
j = nil
|
648
741
|
Ferret::I.new do |i|
|
649
742
|
i << "one"
|
650
743
|
assert_equal(1, i.search("*").total_hits)
|
@@ -654,8 +747,9 @@ class IndexTest < Test::Unit::TestCase
|
|
654
747
|
assert_equal(3, i.search("*").total_hits)
|
655
748
|
assert_equal(3, i.search("id:*").total_hits)
|
656
749
|
assert_equal(2, i.search('id:?*').total_hits)
|
750
|
+
j = i
|
657
751
|
end
|
658
|
-
assert_raise(StandardError) {
|
752
|
+
assert_raise(StandardError) {j.close}
|
659
753
|
end
|
660
754
|
|
661
755
|
def check_highlight(index, q, excerpt_length, num_excerpts, expected, field = :field)
|
@@ -759,4 +853,15 @@ class IndexTest < Test::Unit::TestCase
|
|
759
853
|
index.query_delete('id:one')
|
760
854
|
assert_equal(20, index.size)
|
761
855
|
end
|
856
|
+
|
857
|
+
def test_query_update_delete_more_than_ten
|
858
|
+
index = Ferret::I.new
|
859
|
+
20.times {|i| index << {:id => i, :find => 'match', :change => 'one'} }
|
860
|
+
|
861
|
+
assert_equal(20, index.search('find:match').total_hits)
|
862
|
+
index.query_update('find:match', {:change => 'two'})
|
863
|
+
assert_equal(20, index.search('find:match AND change:two').total_hits)
|
864
|
+
index.query_delete('find:match')
|
865
|
+
assert_equal(0, index.size)
|
866
|
+
end
|
762
867
|
end
|
@@ -191,10 +191,10 @@ module IndexReaderCommon
|
|
191
191
|
def do_test_term_vectors()
|
192
192
|
expected_tv = TermVector.new(:body,
|
193
193
|
[
|
194
|
-
TVTerm.new("word1", [2, 4, 7]),
|
195
|
-
TVTerm.new("word2", [3]),
|
196
|
-
TVTerm.new("word3", [0, 5, 8, 9]),
|
197
|
-
TVTerm.new("word4", [1, 6])
|
194
|
+
TVTerm.new("word1", 3, [2, 4, 7]),
|
195
|
+
TVTerm.new("word2", 1, [3]),
|
196
|
+
TVTerm.new("word3", 4, [0, 5, 8, 9]),
|
197
|
+
TVTerm.new("word4", 2, [1, 6])
|
198
198
|
],
|
199
199
|
[*(0...10)].collect {|i| TVOffsets.new(i*6, (i+1)*6 - 1)})
|
200
200
|
|
@@ -209,13 +209,13 @@ module IndexReaderCommon
|
|
209
209
|
|
210
210
|
tv = tvs[:author]
|
211
211
|
assert_equal(:author, tv.field)
|
212
|
-
assert_equal([TVTerm.new("Leo", [0]), TVTerm.new("Tolstoy", [1])], tv.terms)
|
212
|
+
assert_equal([TVTerm.new("Leo", 1, [0]), TVTerm.new("Tolstoy", 1, [1])], tv.terms)
|
213
213
|
assert(tv.offsets.nil?)
|
214
214
|
|
215
215
|
|
216
216
|
tv = tvs[:title]
|
217
217
|
assert_equal(:title, tv.field)
|
218
|
-
assert_equal([TVTerm.new("War And Peace", nil)], tv.terms)
|
218
|
+
assert_equal([TVTerm.new("War And Peace", 1, nil)], tv.terms)
|
219
219
|
assert_equal([TVOffsets.new(0, 13)], tv.offsets)
|
220
220
|
end
|
221
221
|
|
@@ -254,19 +254,19 @@ module IndexReaderCommon
|
|
254
254
|
|
255
255
|
norms = @ir.norms(:text)
|
256
256
|
|
257
|
-
assert_equal(202, norms[ 3])
|
258
|
-
assert_equal( 20, norms[25])
|
259
|
-
assert_equal(200, norms[50])
|
260
|
-
assert_equal(155, norms[63])
|
257
|
+
assert_equal(202, norms.bytes.to_a[ 3])
|
258
|
+
assert_equal( 20, norms.bytes.to_a[25])
|
259
|
+
assert_equal(200, norms.bytes.to_a[50])
|
260
|
+
assert_equal(155, norms.bytes.to_a[63])
|
261
261
|
|
262
262
|
norms = @ir.norms(:title)
|
263
|
-
assert_equal(1, norms[3])
|
263
|
+
assert_equal(1, norms.bytes.to_a[3])
|
264
264
|
|
265
265
|
norms = @ir.norms(:body)
|
266
|
-
assert_equal(12, norms[3])
|
266
|
+
assert_equal(12, norms.bytes.to_a[3])
|
267
267
|
|
268
268
|
norms = @ir.norms(:author)
|
269
|
-
assert_equal(145, norms[3])
|
269
|
+
assert_equal(145, norms.bytes.to_a[3])
|
270
270
|
|
271
271
|
norms = @ir.norms(:year)
|
272
272
|
# TODO: this returns two possible results depending on whether it is
|
@@ -277,10 +277,10 @@ module IndexReaderCommon
|
|
277
277
|
|
278
278
|
norms = " " * 164
|
279
279
|
@ir.get_norms_into(:text, norms, 100)
|
280
|
-
assert_equal(202, norms[103])
|
281
|
-
assert_equal( 20, norms[125])
|
282
|
-
assert_equal(200, norms[150])
|
283
|
-
assert_equal(155, norms[163])
|
280
|
+
assert_equal(202, norms.bytes.to_a[103])
|
281
|
+
assert_equal( 20, norms.bytes.to_a[125])
|
282
|
+
assert_equal(200, norms.bytes.to_a[150])
|
283
|
+
assert_equal(155, norms.bytes.to_a[163])
|
284
284
|
|
285
285
|
@ir.commit()
|
286
286
|
|
@@ -290,10 +290,10 @@ module IndexReaderCommon
|
|
290
290
|
|
291
291
|
norms = " " * 164
|
292
292
|
ir2.get_norms_into(:text, norms, 100)
|
293
|
-
assert_equal(202, norms[103])
|
294
|
-
assert_equal( 20, norms[125])
|
295
|
-
assert_equal(200, norms[150])
|
296
|
-
assert_equal(155, norms[163])
|
293
|
+
assert_equal(202, norms.bytes.to_a[103])
|
294
|
+
assert_equal( 20, norms.bytes.to_a[125])
|
295
|
+
assert_equal(200, norms.bytes.to_a[150])
|
296
|
+
assert_equal(155, norms.bytes.to_a[163])
|
297
297
|
ir2.close()
|
298
298
|
end
|
299
299
|
|
@@ -608,10 +608,10 @@ class IndexReaderTest < Test::Unit::TestCase
|
|
608
608
|
def do_test_term_vectors(ir)
|
609
609
|
expected_tv = TermVector.new(:body,
|
610
610
|
[
|
611
|
-
TVTerm.new("word1", [2, 4, 7]),
|
612
|
-
TVTerm.new("word2", [3]),
|
613
|
-
TVTerm.new("word3", [0, 5, 8, 9]),
|
614
|
-
TVTerm.new("word4", [1, 6])
|
611
|
+
TVTerm.new("word1", 3, [2, 4, 7]),
|
612
|
+
TVTerm.new("word2", 1, [3]),
|
613
|
+
TVTerm.new("word3", 4, [0, 5, 8, 9]),
|
614
|
+
TVTerm.new("word4", 2, [1, 6])
|
615
615
|
],
|
616
616
|
[*(0...10)].collect {|i| TVOffsets.new(i*6, (i+1)*6 - 1)})
|
617
617
|
|
@@ -626,13 +626,13 @@ class IndexReaderTest < Test::Unit::TestCase
|
|
626
626
|
|
627
627
|
tv = tvs[:author]
|
628
628
|
assert_equal(:author, tv.field)
|
629
|
-
assert_equal([TVTerm.new("Leo", [0]), TVTerm.new("Tolstoy", [1])], tv.terms)
|
629
|
+
assert_equal([TVTerm.new("Leo", 1, [0]), TVTerm.new("Tolstoy", 1, [1])], tv.terms)
|
630
630
|
assert(tv.offsets.nil?)
|
631
631
|
|
632
632
|
|
633
633
|
tv = tvs[:title]
|
634
634
|
assert_equal(:title, tv.field)
|
635
|
-
assert_equal([TVTerm.new("War And Peace", nil)], tv.terms)
|
635
|
+
assert_equal([TVTerm.new("War And Peace", 1, nil)], tv.terms)
|
636
636
|
assert_equal([TVOffsets.new(0, 13)], tv.offsets)
|
637
637
|
end
|
638
638
|
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
1
3
|
require File.dirname(__FILE__) + "/../../test_helper"
|
2
4
|
|
3
5
|
|
@@ -57,6 +59,14 @@ class IndexWriterTest < Test::Unit::TestCase
|
|
57
59
|
iw.close()
|
58
60
|
end
|
59
61
|
|
62
|
+
def test_adding_long_url
|
63
|
+
iw = IndexWriter.new(:dir => @dir,
|
64
|
+
:default_field => 'content')
|
65
|
+
iw << {:content => "http://" + 'x' * 255}
|
66
|
+
# The following line will cause a segfault prior to 0.11.6
|
67
|
+
iw << {:content => "http://" + 'x' * 1_000_000}
|
68
|
+
end
|
69
|
+
|
60
70
|
private
|
61
71
|
|
62
72
|
WORDS = [
|
data/test/unit/index/th_doc.rb
CHANGED
@@ -281,32 +281,49 @@ module IndexTestHelper
|
|
281
281
|
def self.prepare_search_docs
|
282
282
|
i = 1
|
283
283
|
[
|
284
|
-
["20050930", "cat1/",
|
285
|
-
|
286
|
-
["
|
287
|
-
|
288
|
-
["
|
289
|
-
|
290
|
-
["
|
291
|
-
|
292
|
-
["
|
293
|
-
|
294
|
-
["
|
295
|
-
|
296
|
-
["
|
297
|
-
|
298
|
-
["
|
299
|
-
|
300
|
-
["
|
301
|
-
"word1
|
302
|
-
["
|
303
|
-
"word1
|
304
|
-
|
284
|
+
["20050930", "cat1/", 0.123,
|
285
|
+
"word1" ],
|
286
|
+
["20051001", "cat1/sub1", 0.954,
|
287
|
+
"word1 word2 the quick brown fox" ],
|
288
|
+
["20051002", "cat1/sub1/subsub1", 908.125,
|
289
|
+
"word1 word3" ],
|
290
|
+
["20051003", "cat1/sub2", 3999,
|
291
|
+
"word1 word3" ],
|
292
|
+
["20051004", "cat1/sub2/subsub2", "+.3412",
|
293
|
+
"word1 word2" ],
|
294
|
+
["20051005", "cat2/sub1", -1.298,
|
295
|
+
"word1" ],
|
296
|
+
["20051006", "cat2/sub1", "2",
|
297
|
+
"word1 word3" ],
|
298
|
+
["20051007", "cat2/sub1", "+8.894",
|
299
|
+
"word1" ],
|
300
|
+
["20051008", "cat2/sub1", "+21235.2135",
|
301
|
+
"word1 word2 word3 the fast brown fox" ],
|
302
|
+
["20051009", "cat3/sub1", "10.0",
|
303
|
+
"word1" ],
|
304
|
+
["20051010", "cat3/sub1", 1,
|
305
|
+
"word1" ],
|
306
|
+
["20051011", "cat3/sub1", -12518419,
|
307
|
+
"word1 word3 the quick red fox" ],
|
308
|
+
["20051012", "cat3/sub1", "10",
|
309
|
+
"word1" ],
|
310
|
+
["20051013", "cat1/sub2", "15682954",
|
311
|
+
"word1" ],
|
312
|
+
["20051014", "cat1/sub1", "91239",
|
313
|
+
"word1 word3 the quick hairy fox" ],
|
314
|
+
["20051015", "cat1/sub2/subsub1", "-.89321",
|
315
|
+
"word1" ],
|
316
|
+
["20051016", "cat1/sub1/subsub2", -89,
|
317
|
+
"word1 the quick fox is brown and hairy and a little red" ],
|
318
|
+
["20051017", "cat1/", "-1.0",
|
319
|
+
"word1 the brown fox is quick and red" ]
|
320
|
+
].map do |date, category, number, field|
|
305
321
|
doc = Ferret::Document.new(i)
|
306
322
|
i += 1
|
307
323
|
doc[:date] = date
|
308
324
|
doc[:category] = category
|
309
325
|
doc[:field] = field
|
326
|
+
doc[:number] = number
|
310
327
|
doc
|
311
328
|
end
|
312
329
|
end
|
@@ -1,4 +1,5 @@
|
|
1
1
|
require File.dirname(__FILE__) + "/../../test_helper"
|
2
|
+
require 'date'
|
2
3
|
|
3
4
|
|
4
5
|
class FilterTest < Test::Unit::TestCase
|
@@ -39,16 +40,6 @@ class FilterTest < Test::Unit::TestCase
|
|
39
40
|
end
|
40
41
|
end
|
41
42
|
|
42
|
-
def test_filter_proc
|
43
|
-
searcher = Searcher.new(@dir)
|
44
|
-
q = MatchAllQuery.new()
|
45
|
-
filter_proc = lambda {|doc, score, s| (s[doc][:int] % 2) == 0}
|
46
|
-
top_docs = searcher.search(q, :filter_proc => filter_proc)
|
47
|
-
top_docs.hits.each do |hit|
|
48
|
-
assert_equal(0, searcher[hit.doc][:int] % 2)
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
43
|
def test_range_filter
|
53
44
|
searcher = Searcher.new(@dir)
|
54
45
|
q = MatchAllQuery.new()
|
@@ -132,4 +123,34 @@ class FilterTest < Test::Unit::TestCase
|
|
132
123
|
filt = CustomFilter.new
|
133
124
|
do_test_top_docs(searcher, q, [0, 2, 4], filt)
|
134
125
|
end
|
126
|
+
|
127
|
+
def test_filter_proc
|
128
|
+
searcher = Searcher.new(@dir)
|
129
|
+
q = MatchAllQuery.new()
|
130
|
+
filter_proc = lambda {|doc, score, s| (s[doc][:int] % 2) == 0}
|
131
|
+
top_docs = searcher.search(q, :filter_proc => filter_proc)
|
132
|
+
top_docs.hits.each do |hit|
|
133
|
+
assert_equal(0, searcher[hit.doc][:int] % 2)
|
134
|
+
end
|
135
|
+
end
|
136
|
+
|
137
|
+
def test_score_modifying_filter_proc
|
138
|
+
searcher = Searcher.new(@dir)
|
139
|
+
q = MatchAllQuery.new()
|
140
|
+
start_date = Date.parse('2008-02-08')
|
141
|
+
date_half_life_50 = lambda do |doc, score, s|
|
142
|
+
days = (start_date - Date.parse(s[doc][:date], '%Y%m%d')).to_i
|
143
|
+
1.0 / (2.0 ** (days.to_f / 50.0))
|
144
|
+
end
|
145
|
+
top_docs = searcher.search(q, :filter_proc => date_half_life_50)
|
146
|
+
docs = top_docs.hits.collect {|hit| hit.doc}
|
147
|
+
assert_equal(docs, [2,4,9,8,6,3,5,1,7,0])
|
148
|
+
rev_date_half_life_50 = lambda do |doc, score, s|
|
149
|
+
days = (start_date - Date.parse(s[doc][:date], '%Y%m%d')).to_i
|
150
|
+
1.0 - 1.0 / (2.0 ** (days.to_f / 50.0))
|
151
|
+
end
|
152
|
+
top_docs = searcher.search(q, :filter_proc => rev_date_half_life_50)
|
153
|
+
docs = top_docs.hits.collect {|hit| hit.doc}
|
154
|
+
assert_equal(docs, [0,7,1,3,5,6,8,9,2,4])
|
155
|
+
end
|
135
156
|
end
|
@@ -50,6 +50,12 @@ class SearcherTest < Test::Unit::TestCase
|
|
50
50
|
assert(score_doc.score.approx_eql?(@searcher.explain(query, score_doc.doc).score),
|
51
51
|
"Scores(#{score_doc.score} != #{@searcher.explain(query, score_doc.doc).score})")
|
52
52
|
end
|
53
|
+
|
54
|
+
assert_equal(expected.sort, @searcher.scan(query))
|
55
|
+
if expected.size > 5
|
56
|
+
assert_equal(expected[0...5], @searcher.scan(query, :limit => 5))
|
57
|
+
assert_equal(expected[5..-1], @searcher.scan(query, :start_doc => expected[5]))
|
58
|
+
end
|
53
59
|
end
|
54
60
|
|
55
61
|
def test_get_doc()
|