ferret 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +1 -1
- data/TODO +3 -0
- data/ext/dummy.exe +0 -0
- data/lib/ferret.rb +1 -1
- data/lib/ferret/analysis/token.rb +6 -0
- data/lib/ferret/analysis/tokenizers.rb +5 -5
- data/lib/ferret/document/document.rb +10 -13
- data/lib/ferret/index/compound_file_io.rb +12 -9
- data/lib/ferret/index/field_infos.rb +0 -6
- data/lib/ferret/index/index.rb +220 -102
- data/lib/ferret/index/index_reader.rb +22 -2
- data/lib/ferret/index/index_writer.rb +55 -14
- data/lib/ferret/index/multi_reader.rb +279 -279
- data/lib/ferret/index/segment_infos.rb +3 -3
- data/lib/ferret/index/segment_merger.rb +7 -6
- data/lib/ferret/index/segment_reader.rb +23 -7
- data/lib/ferret/index/segment_term_enum.rb +6 -7
- data/lib/ferret/index/term_buffer.rb +3 -5
- data/lib/ferret/index/term_doc_enum.rb +7 -2
- data/lib/ferret/index/term_infos_io.rb +15 -8
- data/lib/ferret/query_parser/query_parser.tab.rb +49 -45
- data/lib/ferret/search/boolean_query.rb +3 -4
- data/lib/ferret/search/boolean_scorer.rb +11 -11
- data/lib/ferret/search/caching_wrapper_filter.rb +1 -1
- data/lib/ferret/search/disjunction_sum_scorer.rb +9 -7
- data/lib/ferret/search/field_cache.rb +1 -2
- data/lib/ferret/search/field_sorted_hit_queue.rb +1 -1
- data/lib/ferret/search/fuzzy_term_enum.rb +64 -58
- data/lib/ferret/search/index_searcher.rb +16 -9
- data/lib/ferret/search/prefix_query.rb +7 -0
- data/lib/ferret/search/query_filter.rb +1 -1
- data/lib/ferret/search/term_scorer.rb +5 -1
- data/lib/ferret/search/top_docs.rb +12 -0
- data/lib/ferret/store/buffered_index_io.rb +5 -6
- data/lib/ferret/store/fs_store.rb +47 -33
- data/lib/ferret/store/ram_store.rb +2 -2
- data/lib/ferret/utils.rb +1 -0
- data/lib/ferret/utils/bit_vector.rb +20 -2
- data/lib/ferret/utils/thread_local.rb +28 -0
- data/lib/ferret/utils/weak_key_hash.rb +11 -2
- data/test/benchmark/tb_rw_vint.rb +1 -1
- data/test/functional/thread_safety_index_test.rb +81 -0
- data/test/functional/thread_safety_test.rb +137 -0
- data/test/test_all.rb +3 -7
- data/test/test_helper.rb +2 -1
- data/test/unit/index/tc_compound_file_io.rb +2 -2
- data/test/unit/index/tc_index.rb +128 -6
- data/test/unit/index/tc_index_reader.rb +1 -1
- data/test/unit/index/tc_segment_infos.rb +1 -1
- data/test/unit/index/th_doc.rb +1 -1
- data/test/unit/search/tc_index_searcher.rb +6 -0
- data/test/unit/store/tc_fs_store.rb +3 -3
- data/test/unit/utils/tc_bit_vector.rb +8 -0
- data/test/unit/utils/tc_thread.rb +61 -0
- data/test/unit/utils/tc_weak_key_hash.rb +2 -2
- data/test/utils/number_to_spoken.rb +132 -0
- metadata +7 -2
@@ -597,7 +597,7 @@ class IndexReaderTest < Test::Unit::TestCase
|
|
597
597
|
def test_ir_read_while_optimizing_on_disk()
|
598
598
|
dpath = File.join(File.dirname(__FILE__),
|
599
599
|
'../../temp/fsdir')
|
600
|
-
fs_dir = Ferret::Store::FSDirectory.
|
600
|
+
fs_dir = Ferret::Store::FSDirectory.new(dpath, true)
|
601
601
|
|
602
602
|
iw = IndexWriter.new(fs_dir, :analyzer => WhiteSpaceAnalyzer.new(), :create => true)
|
603
603
|
docs = IndexTestHelper.prepare_ir_test_docs()
|
@@ -63,7 +63,7 @@ class SegmentInfoTest < Test::Unit::TestCase
|
|
63
63
|
assert_equal(si.name, "seg1")
|
64
64
|
@dir.close()
|
65
65
|
@dpath = File.dirname(__FILE__) + '/../../temp/fsdir'
|
66
|
-
@dir = Ferret::Store::FSDirectory.
|
66
|
+
@dir = Ferret::Store::FSDirectory.new(@dpath, true)
|
67
67
|
si.name = "seg2"
|
68
68
|
si.doc_count += 2
|
69
69
|
si.directory = @dir
|
data/test/unit/index/th_doc.rb
CHANGED
@@ -234,7 +234,7 @@ module IndexTestHelper
|
|
234
234
|
doc.boost = i+1
|
235
235
|
|
236
236
|
fields.each_pair do |field, text|
|
237
|
-
doc << Field.new(field, text, Field::Store::
|
237
|
+
doc << Field.new(field, text, Field::Store::YES, Field::Index::TOKENIZED, Field::TermVector::NO, false)
|
238
238
|
end
|
239
239
|
docs << doc
|
240
240
|
end
|
@@ -46,6 +46,12 @@ class IndexSearcherTest < Test::Unit::TestCase
|
|
46
46
|
end
|
47
47
|
end
|
48
48
|
|
49
|
+
def test_get_doc()
|
50
|
+
assert_equal(18, @is.max_doc)
|
51
|
+
assert_equal("20050930", @is.doc(0).values(:date))
|
52
|
+
assert_equal("cat1/sub2/subsub2", @is.doc(4)[:cat])
|
53
|
+
end
|
54
|
+
|
49
55
|
def test_term_query
|
50
56
|
tq = TermQuery.new(Term.new("field", "word2"));
|
51
57
|
tq.boost = 100
|
@@ -26,7 +26,7 @@ class FSStoreTest < Test::Unit::TestCase
|
|
26
26
|
def setup
|
27
27
|
@dpath = File.join(File.dirname(__FILE__),
|
28
28
|
'../../temp/fsdir')
|
29
|
-
@dir = FSDirectory.
|
29
|
+
@dir = FSDirectory.new(@dpath, true)
|
30
30
|
end
|
31
31
|
|
32
32
|
def teardown
|
@@ -39,12 +39,12 @@ class FSStoreTest < Test::Unit::TestCase
|
|
39
39
|
'/../../temp/cachetest')
|
40
40
|
assert(! FSDirectory.directory_cache[dir_path],
|
41
41
|
"this directory should not be cached yet")
|
42
|
-
@dir1 = FSDirectory.
|
42
|
+
@dir1 = FSDirectory.new(dir_path, true)
|
43
43
|
assert(FSDirectory.directory_cache[dir_path],
|
44
44
|
"this directory should now be cached")
|
45
45
|
assert_equal(@dir1.ref_count, 1,
|
46
46
|
"There is one reference so the refcount should now be 1")
|
47
|
-
@dir2 = FSDirectory.
|
47
|
+
@dir2 = FSDirectory.new(dir_path, true)
|
48
48
|
assert(@dir1 === @dir2,
|
49
49
|
"The directory should be cached so the same directory object should have been returned")
|
50
50
|
assert_equal(@dir1.ref_count, 2,
|
@@ -4,6 +4,14 @@ require File.dirname(__FILE__) + "/../../test_helper"
|
|
4
4
|
class BitVectorTest < Test::Unit::TestCase
|
5
5
|
include Ferret::Utils
|
6
6
|
|
7
|
+
def test_bignum_conversion()
|
8
|
+
j = 256
|
9
|
+
10.times do
|
10
|
+
j *= j
|
11
|
+
assert_equal(j, BitVector.string_to_bignum(BitVector.bignum_to_string(j)))
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
7
15
|
def test_bv()
|
8
16
|
bv = BitVector.new
|
9
17
|
assert_equal(0, bv.count)
|
@@ -0,0 +1,61 @@
|
|
1
|
+
require File.dirname(__FILE__) + "/../../test_helper"
|
2
|
+
require 'thread'
|
3
|
+
|
4
|
+
|
5
|
+
class ThreadTest < Test::Unit::TestCase
|
6
|
+
include Ferret::Utils
|
7
|
+
|
8
|
+
NUM_THREADS = 100
|
9
|
+
|
10
|
+
def test_basic_get_and_set()
|
11
|
+
Thread.current.clear_local
|
12
|
+
b = "hello"
|
13
|
+
Thread.current.set_local(b, "dave")
|
14
|
+
assert_equal("dave", Thread.current.get_local(b))
|
15
|
+
end
|
16
|
+
|
17
|
+
def test_objects_die
|
18
|
+
Thread.current.clear_local
|
19
|
+
a = []
|
20
|
+
10.times {|i| a[i] = "#{i}"; Thread.current.set_local(a[i], i) }
|
21
|
+
10.times {|i| assert_equal(i, Thread.current.get_local(a[i])) }
|
22
|
+
assert_equal(10, Thread.current.local_size)
|
23
|
+
GC.start
|
24
|
+
assert_equal(10, Thread.current.local_size)
|
25
|
+
10.times {|i| a[i] = nil; }
|
26
|
+
#puts w
|
27
|
+
|
28
|
+
# this is a hack to get the GC to collect the last ref created above
|
29
|
+
x = WeakKeyHash.new()
|
30
|
+
10.times {|i| a[i] = "#{i}"; x[a[i]] = i }
|
31
|
+
|
32
|
+
assert_equal(10, Thread.current.local_size)
|
33
|
+
GC.start
|
34
|
+
assert(0, Thread.current.local_size)
|
35
|
+
end
|
36
|
+
|
37
|
+
class ThreadTester
|
38
|
+
def initialize(val)
|
39
|
+
Thread.current.set_local(self, val)
|
40
|
+
end
|
41
|
+
def inc
|
42
|
+
val = Thread.current.get_local(self) + 1
|
43
|
+
Thread.current.set_local(self, val)
|
44
|
+
return val
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
def single_thread
|
49
|
+
tt = ThreadTester.new(start = rand(10000000))
|
50
|
+
((start+1)..start+11).each {|i| assert_equal(i, tt.inc) }
|
51
|
+
end
|
52
|
+
|
53
|
+
def test_threads_dont_share
|
54
|
+
threads = []
|
55
|
+
NUM_THREADS.times do
|
56
|
+
threads << Thread.new { single_thread }
|
57
|
+
end
|
58
|
+
|
59
|
+
threads.each {|t| t.join}
|
60
|
+
end
|
61
|
+
end
|
@@ -4,11 +4,12 @@ require File.dirname(__FILE__) + "/../../test_helper"
|
|
4
4
|
class WeakKeyHashTest < Test::Unit::TestCase
|
5
5
|
include Ferret::Utils
|
6
6
|
|
7
|
-
def
|
7
|
+
def test_objects_are_destroyed()
|
8
8
|
w = WeakKeyHash.new()
|
9
9
|
a = []
|
10
10
|
10.times {|i| a[i] = "#{i}"; w[a[i]] = i }
|
11
11
|
10.times {|i| assert_equal(i, w[a[i]]) }
|
12
|
+
assert_equal(10, w.size)
|
12
13
|
10.times {|i| a[i] = nil; }
|
13
14
|
#puts w
|
14
15
|
|
@@ -16,7 +17,6 @@ class WeakKeyHashTest < Test::Unit::TestCase
|
|
16
17
|
x = WeakKeyHash.new()
|
17
18
|
10.times {|i| a[i] = "#{i}"; x[a[i]] = i }
|
18
19
|
|
19
|
-
assert_equal(10, w.size)
|
20
20
|
GC.start
|
21
21
|
#puts w.size
|
22
22
|
#puts w
|
@@ -0,0 +1,132 @@
|
|
1
|
+
# Author: Matthew D Moss
|
2
|
+
#
|
3
|
+
# Written for ruby quiz #25
|
4
|
+
#
|
5
|
+
class JapaneseTranslator
|
6
|
+
# My knowledge of counting Japanese is limited, so this may not
|
7
|
+
# be entirely correct; in particular, I don't know what rules
|
8
|
+
# to follow after 'hyaku man' (1,000,000).
|
9
|
+
# I also combine a digit with its group, such as 'gohyaku' rather
|
10
|
+
# than 'go hyaku'; I just like reading it better that way.
|
11
|
+
|
12
|
+
DIGITS = %w(zero ichi ni san yon go roku nana hachi kyu)
|
13
|
+
GROUPS = %w(nothingtoseeheremovealong ju hyaku sen)
|
14
|
+
MAN = 10000
|
15
|
+
|
16
|
+
def to_spoken(val)
|
17
|
+
case val <=> 0
|
18
|
+
when -1
|
19
|
+
'- ' + to_spoken(-val)
|
20
|
+
when 0
|
21
|
+
DIGITS[0]
|
22
|
+
else
|
23
|
+
group(val, 0)
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
private
|
28
|
+
|
29
|
+
def group(val, level)
|
30
|
+
if val >= MAN
|
31
|
+
group(val / MAN, 0) + 'man ' + group(val % MAN, 0)
|
32
|
+
else
|
33
|
+
case val
|
34
|
+
when 0
|
35
|
+
''
|
36
|
+
when 1
|
37
|
+
level == 0 ? DIGITS[val] : GROUPS[level]
|
38
|
+
when 2...10
|
39
|
+
DIGITS[val] + (GROUPS[level] if level > 0).to_s
|
40
|
+
else
|
41
|
+
group(val / 10, level+1) + ' ' + group(val % 10, level)
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
|
48
|
+
class USEnglishTranslator
|
49
|
+
# Formal, US English. Optional 'and'. Will not produce things
|
50
|
+
# such as 'twelve hundred' but rather 'one thousand two hundred'.
|
51
|
+
# The use of 'and' is incomplete; it is sometimes missed.
|
52
|
+
|
53
|
+
DIGITS = %w(zero one two three four five six seven eight nine)
|
54
|
+
TEENS = %w(ten eleven twelve thirteen fourteen fifteen sixteen
|
55
|
+
seventeen eighteen nineteen)
|
56
|
+
TENS = %w(hello world twenty thirty forty fifty sixty seventy
|
57
|
+
eighty ninety)
|
58
|
+
GROUPS = %w(thousand million billion trillion quadrillion
|
59
|
+
quintillion sextillion septillion octillion nonillion
|
60
|
+
decillion)
|
61
|
+
K = 1000
|
62
|
+
|
63
|
+
def initialize(conjunction = true)
|
64
|
+
@conjunction = conjunction
|
65
|
+
end
|
66
|
+
|
67
|
+
def to_spoken(val)
|
68
|
+
case val <=> 0
|
69
|
+
when -1
|
70
|
+
'negative ' + to_spoken(-val)
|
71
|
+
when 0
|
72
|
+
DIGITS[0]
|
73
|
+
else
|
74
|
+
group(val, 0).flatten.join(' ')
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
private
|
79
|
+
|
80
|
+
def group(val, level)
|
81
|
+
x = group(val / K, level + 1) << GROUPS[level] if val >= K
|
82
|
+
x.to_a << under_1000(val % K, level)
|
83
|
+
end
|
84
|
+
|
85
|
+
def under_1000(val, level)
|
86
|
+
x = [DIGITS[val / 100]] << 'hundred' if val >= 100
|
87
|
+
x.to_a << under_100(val % 100, (level == 0 and not x.nil?))
|
88
|
+
end
|
89
|
+
|
90
|
+
def under_100(val, junction)
|
91
|
+
x = [('and' if @conjunction and junction)] # wyf?
|
92
|
+
case val
|
93
|
+
when 0
|
94
|
+
[]
|
95
|
+
when 1...10
|
96
|
+
x << DIGITS[val]
|
97
|
+
when 10...20
|
98
|
+
x << TEENS[val - 10]
|
99
|
+
else
|
100
|
+
d = val % 10
|
101
|
+
x << (TENS[val / 10] + ('-' + DIGITS[d] if d != 0).to_s)
|
102
|
+
end
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
106
|
+
|
107
|
+
class Integer
|
108
|
+
def to_spoken(translator = USEnglishTranslator.new)
|
109
|
+
translator.to_spoken(self).squeeze(' ').strip
|
110
|
+
end
|
111
|
+
end
|
112
|
+
|
113
|
+
if $0 == __FILE__
|
114
|
+
SAMPLES = [ 0, 1, 2, 5, 10, 11, 14, 18, 20, 21, 29, 33, 42, 50, 87, 99,
|
115
|
+
100, 101, 110, 167, 199, 200, 201, 276, 300, 314, 500, 610,
|
116
|
+
1000, 1039, 1347, 2309, 3098, 23501, 32767, 70000, 5480283,
|
117
|
+
2435489238, 234100090000, -42, -2001 ]
|
118
|
+
|
119
|
+
TRANSLATORS = { 'US English' => USEnglishTranslator.new,
|
120
|
+
'Japanese' => JapaneseTranslator.new }
|
121
|
+
|
122
|
+
|
123
|
+
# main
|
124
|
+
TRANSLATORS.each do |lang, translator|
|
125
|
+
puts
|
126
|
+
puts lang
|
127
|
+
puts '-' * lang.length
|
128
|
+
SAMPLES.each do |val|
|
129
|
+
puts "%12d => %s" % [val, val.to_spoken(translator)]
|
130
|
+
end
|
131
|
+
end
|
132
|
+
end
|
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.8.11
|
|
3
3
|
specification_version: 1
|
4
4
|
name: ferret
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.1.
|
7
|
-
date: 2005-
|
6
|
+
version: 0.1.4
|
7
|
+
date: 2005-11-01 00:00:00 +09:00
|
8
8
|
summary: Ruby indexing library.
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -164,6 +164,7 @@ files:
|
|
164
164
|
- lib/ferret/utils/string_helper.rb
|
165
165
|
- lib/ferret/utils/number_tools.rb
|
166
166
|
- lib/ferret/utils/date_tools.rb
|
167
|
+
- lib/ferret/utils/thread_local.rb
|
167
168
|
- test/test_helper.rb
|
168
169
|
- test/test_all.rb
|
169
170
|
- test/unit/ts_document.rb
|
@@ -180,6 +181,7 @@ files:
|
|
180
181
|
- test/unit/utils/tc_date_tools.rb
|
181
182
|
- test/unit/utils/tc_parameter.rb
|
182
183
|
- test/unit/utils/tc_weak_key_hash.rb
|
184
|
+
- test/unit/utils/tc_thread.rb
|
183
185
|
- test/unit/analysis/tc_lower_case_tokenizer.rb
|
184
186
|
- test/unit/analysis/tc_lower_case_filter.rb
|
185
187
|
- test/unit/analysis/tc_porter_stem_filter.rb
|
@@ -230,6 +232,9 @@ files:
|
|
230
232
|
- test/longrunning/tm_store.rb
|
231
233
|
- test/benchmark/tb_rw_vint.rb
|
232
234
|
- test/benchmark/tb_ram_store.rb
|
235
|
+
- test/functional/thread_safety_index_test.rb
|
236
|
+
- test/functional/thread_safety_test.rb
|
237
|
+
- test/utils/number_to_spoken.rb
|
233
238
|
- test/unit/analysis/data/wordfile
|
234
239
|
- rake_utils/code_statistics.rb
|
235
240
|
test_files: []
|