fastri 0.1.1.1 → 0.2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGES +12 -0
- data/Rakefile +5 -4
- data/bin/fastri-server +77 -35
- data/bin/fri +145 -8
- data/bin/ri-emacs +1 -1
- data/lib/fastri/full_text_index.rb +245 -0
- data/lib/fastri/full_text_indexer.rb +100 -0
- data/lib/fastri/ri_index.rb +30 -0
- data/lib/fastri/ri_service.rb +6 -0
- data/lib/fastri/util.rb +83 -0
- data/lib/fastri/version.rb +6 -1
- data/test/test_full_text_index.rb +182 -0
- data/test/test_full_text_indexer.rb +84 -0
- data/test/test_integration_full_text_index.rb +43 -0
- data/test/test_ri_index.rb +99 -1
- data/test/test_util.rb +38 -0
- metadata +14 -3
@@ -0,0 +1,100 @@
|
|
1
|
+
# Copyright (C) 2006 Mauricio Fernandez <mfp@acm.org>
|
2
|
+
#
|
3
|
+
|
4
|
+
require 'fastri/version'
|
5
|
+
|
6
|
+
module FastRI
|
7
|
+
|
8
|
+
class FullTextIndexer
|
9
|
+
WORD_RE = /[A-Za-z0-9_]+/
|
10
|
+
NONWORD_RE = /[^A-Za-z0-9_]+/
|
11
|
+
MAGIC = "FastRI full-text index #{FASTRI_FT_INDEX_FORMAT}\0"
|
12
|
+
|
13
|
+
# Sets up an indexer with no registered documents.
#
# max_querysize:: kept as @max_wordsize; build_index uses it as the length
#                 of the text slice taken at each suffix when sorting.
def initialize(max_querysize)
  @doc_hash, @documents = {}, []
  @max_wordsize = max_querysize
end
|
18
|
+
|
19
|
+
# Registers (or replaces) the document called +name+.
#
# name::     key under which the document is stored
# data::     the document's text
# metadata:: arbitrary object stored alongside the text (defaults to {})
#
# The name is appended to the ordered name list on every call, even when it
# was registered before. Returns that name list.
def add_document(name, data, metadata = {})
  @doc_hash.store(name, [data, metadata])
  @documents.push(name)
end
|
23
|
+
|
24
|
+
# Returns the text stored for the document called +name+ (the first element
# of its [data, metadata] entry).
def data(name)
  entry = @doc_hash[name]
  entry[0]
end
|
27
|
+
|
28
|
+
# Returns the registered document names without duplicates, in order of
# first registration. The de-duplicated list also replaces the stored one.
def documents
  deduped = @documents.uniq
  @documents = deduped
end
|
31
|
+
|
32
|
+
# Returns a copy of +str+ with every NUL character removed. (build_index
# writes NUL as the terminator after each document's text.)
def preprocess(str)
  nul = /\0/
  str.gsub(nul, "")
end
|
35
|
+
|
36
|
+
require 'strscan'
|
37
|
+
# Returns the offsets of the word starts in +text+, using the class-wide
# WORD_RE / NONWORD_RE patterns. Each offset is shifted by +offset+.
def find_suffixes(text, offset)
  find_suffixes_simple(text, WORD_RE, NONWORD_RE, offset)
end

# Scans +string+ and returns an Array with the position of every word start,
# each shifted by +offset+.
#
# string::     text to scan
# word_re::    pattern matching a word (or part of one) at the scan position
# nonword_re:: pattern matching separator text at the scan position
# offset::     value added to every position before it is recorded
#
# Positions are StringScanner positions, i.e. byte offsets. End of input is
# detected with StringScanner#eos? rather than by comparing the byte position
# with String#size (a character count on Ruby >= 1.9), which mis-detected the
# end of multibyte text: it recorded a bogus trailing offset or never
# terminated.
#
# If neither pattern (or only an empty match) applies at some position, that
# position is recorded once and a single character is skipped, so the scan
# always makes progress instead of looping forever.
def find_suffixes_simple(string, word_re, nonword_re, offset)
  suffixes = []
  sc = StringScanner.new(string)

  # Consume consecutive separator matches; stop on no match or an empty one.
  skip_separators = lambda do
    loop do
      skipped = sc.skip(nonword_re)
      break unless skipped && skipped > 0
    end
  end

  skip_separators.call
  until sc.eos?
    start = sc.pos
    suffixes << offset + start
    sc.skip(word_re)
    skip_separators.call
    # Guarantee forward progress when nothing matched at +start+.
    sc.getch if sc.pos == start
  end
  suffixes
end
|
60
|
+
|
61
|
+
require 'enumerator'
require 'stringio'

# Writes the full-text file and its suffix array for all registered documents.
#
# full_text_IO::    IO-like object (responds to #write) receiving the text file
# suffix_array_IO:: IO-like object receiving the sorted suffix offsets, each
#                   packed as a 32-bit little-endian integer ("V")
#
# Full-text layout: MAGIC, then for every document
#   data, NUL, length ("V"), name, NUL, Marshal.dump(metadata), NUL
# where length counts the bytes following the length field up to and
# including the final NUL.
#
# Offsets are byte offsets into the full-text file. Suffixes are ordered by
# the first @max_wordsize bytes of text found at each offset. Returns nil.
def build_index(full_text_IO, suffix_array_IO)
  # String#bytesize where available; plain #size on very old Rubies, where
  # strings are byte sequences anyway.
  byte_length = lambda { |s| s.respond_to?(:bytesize) ? s.bytesize : s.size }

  # In-memory copy of everything sent to full_text_IO. String.new yields a
  # binary string on modern Rubies, so later slicing is per byte and matches
  # the byte offsets produced by the scanner.
  fulltext = String.new
  io = StringIO.new(fulltext)
  emit = lambda do |chunk|
    io.write(chunk)
    full_text_IO.write(chunk)
  end

  emit.call(MAGIC)
  documents.each do |doc|
    data, metadata = @doc_hash[doc]
    name     = doc.to_s
    meta_txt = Marshal.dump(metadata)
    # Pieces are written one by one so text and binary parts never have to
    # be concatenated into a single (possibly encoding-incompatible) string.
    emit.call(data)
    emit.call("\0")
    emit.call([byte_length.call(name) + byte_length.call(meta_txt) + 2].pack("V"))
    emit.call(name)
    emit.call("\0")
    emit.call(meta_txt)
    emit.call("\0")
  end

  scanner = StringScanner.new(fulltext)
  scanner.pos = byte_length.call(MAGIC) # skip the header written above

  suffixes = []
  until scanner.eos?
    start = scanner.pos
    text = scanner.scan_until(/\0/)
    suffixes.concat find_suffixes(text[0..-2], start) # drop the NUL terminator
    # Read the length field as 4 raw bytes; a regexp such as /..../ fails
    # when one of them is "\n".
    len = scanner.peek(4).unpack("V")[0]
    scanner.pos += 4 + len
  end

  sorted = suffixes.sort_by { |x| fulltext[x, @max_wordsize] }
  sorted.each_slice(10000) { |slice| suffix_array_IO.write slice.pack("V*") }
  nil
end
|
98
|
+
end # class FullTextIndexer
|
99
|
+
|
100
|
+
end # module FastRI
|
data/lib/fastri/ri_index.rb
CHANGED
@@ -3,9 +3,37 @@
|
|
3
3
|
|
4
4
|
require 'rdoc/ri/ri_cache'
|
5
5
|
require 'rdoc/ri/ri_reader'
|
6
|
+
require 'rdoc/ri/ri_descriptions'
|
6
7
|
require 'fastri/version'
|
7
8
|
|
8
9
|
|
10
|
+
# This is taken straight from 1.8.5's rdoc/ri/ri_descriptions.rb.
|
11
|
+
# Older releases have a buggy #merge_in that crashes when old.comment is nil.
|
12
|
+
if RUBY_RELEASE_DATE < "2006-06-15"
  module ::RI # :nodoc:
    class ModuleDescription # :nodoc:
      remove_method :merge_in

      # Merge another class description into this one: method, attribute,
      # constant and include lists are merged, and the other description's
      # comment is adopted when this one has none, or appended after a rule
      # separator otherwise. A nil or empty comment on +old+ is ignored.
      def merge_in(old)
        merge(@class_methods, old.class_methods)
        merge(@instance_methods, old.instance_methods)
        merge(@attributes, old.attributes)
        merge(@constants, old.constants)
        merge(@includes, old.includes)

        incoming = old.comment
        if @comment.nil? || @comment.empty?
          @comment = incoming
        elsif !(incoming.nil? || incoming.empty?)
          @comment << SM::Flow::RULE.new
          @comment.concat incoming
        end
      end
    end
  end
end
|
35
|
+
|
36
|
+
|
9
37
|
module FastRI
|
10
38
|
|
11
39
|
# This class provides the same functionality as RiReader, with some
|
@@ -466,6 +494,8 @@ class RiIndex
|
|
466
494
|
when /[#.]\S+/
|
467
495
|
method_entry = get_entry(@method_array, entry_or_name, MethodEntry, nil)
|
468
496
|
source_paths_for(method_entry)
|
497
|
+
when ""
|
498
|
+
[]
|
469
499
|
else
|
470
500
|
class_entry = get_entry(@namespace_array, entry_or_name, ClassEntry, nil)
|
471
501
|
source_paths_for(class_entry)
|
data/lib/fastri/ri_service.rb
CHANGED
@@ -273,6 +273,12 @@ class RiService
|
|
273
273
|
m.add_matcher(:partial_ci) do
|
274
274
|
m.yield @ri_reader.methods_under_matching("", /#{sep_re}#{name}/i, true)
|
275
275
|
end
|
276
|
+
m.add_matcher(:anywhere) do
|
277
|
+
m.yield @ri_reader.methods_under_matching("", /#{sep_re}.*#{name}/, true)
|
278
|
+
end
|
279
|
+
m.add_matcher(:anywhere_ci) do
|
280
|
+
m.yield @ri_reader.methods_under_matching("", /#{sep_re}.*#{name}/i, true)
|
281
|
+
end
|
276
282
|
end
|
277
283
|
matcher.get_matches(order)
|
278
284
|
end
|
data/lib/fastri/util.rb
ADDED
@@ -0,0 +1,83 @@
|
|
1
|
+
# Copyright (C) 2006 Mauricio Fernandez <mfp@acm.org>
|
2
|
+
|
3
|
+
require 'rdoc/ri/ri_paths'
|
4
|
+
begin
|
5
|
+
require 'rubygems'
|
6
|
+
rescue LoadError
|
7
|
+
end
|
8
|
+
|
9
|
+
require 'rdoc/ri/ri_writer'
|
10
|
+
|
11
|
+
module FastRI
|
12
|
+
module Util
|
13
|
+
# Return an array of <tt>[name, version, path]</tt> arrays corresponding to
|
14
|
+
# the last version of each installed gem. +path+ is the base path of the RI
|
15
|
+
# documentation from the gem. If the version cannot be determined, it will
|
16
|
+
# be +nil+, and the corresponding gem might be repeated in the output array
|
17
|
+
# (once per version).
|
18
|
+
# Implementation notes:
# * Gem.path is a list of repository roots, so each root is globbed
#   separately instead of being interpolated into one pattern.
# * The name/version split only looks at the directory directly above "ri",
#   so dashes in parent directories cannot confuse it.
# * A directory without a "-<version>" suffix is reported with a nil version
#   under its own name.
# * Versions are compared as Gem::Version when well-formed (so 0.10.0 beats
#   0.9.0); anything else is compared as a plain string and ranks lower.
def gem_directories_unique
  return [] unless defined? Gem
  gemdirs = Array(Gem.path).map { |root| Dir[File.join(root, "doc", "*", "ri")] }.flatten
  gems = Hash.new { |h, k| h[k] = [] }
  gemdirs.each do |path|
    md = %r{/([^/]+)-([^/]*)/ri$}.match(path)
    if md
      gemname, version = md.captures
      gems[gemname] << [version, path]
    else # doesn't follow any conventions :(
      gems[path[%r{/([^/]+)/ri$}, 1]] << [nil, path]
    end
  end

  # The leading rank keeps Gem::Version and String keys from ever being
  # compared with each other.
  version_key = lambda do |version|
    if version && Gem::Version.correct?(version)
      [1, Gem::Version.new(version)]
    else
      [0, version.to_s]
    end
  end

  gems.sort_by { |name, _| name }.map do |name, versions|
    version, path = versions.sort_by { |ver, dir| version_key.call(ver) + [dir] }.last
    [name, version, File.expand_path(path)]
  end
end
|
35
|
+
module_function :gem_directories_unique
|
36
|
+
|
37
|
+
# Return the <tt>[name, version, path]</tt> array for the gem owning the RI
|
38
|
+
# information stored in +path+, or +nil+.
|
39
|
+
# +gem_dir_info+ is a list of [name, version, path] triples. Among the
# entries whose path is a prefix of the expanded +path+, the one with the
# longest path wins; remaining ties are ordered by version, then name.
def gem_info_for_path(path, gem_dir_info = FastRI::Util.gem_directories_unique)
  absolute = File.expand_path(path)
  candidates = gem_dir_info.select do |_name, _version, gem_path|
    absolute.index(gem_path) == 0
  end
  ranked = candidates.sort_by do |name, version, gem_path|
    [gem_path.size, version, name]
  end
  ranked.last
end
|
44
|
+
module_function :gem_info_for_path
|
45
|
+
|
46
|
+
# Return the +full_name+ (in ClassEntry or MethodEntry's sense) given a path
|
47
|
+
# to a .yaml file relative to a "base RI DB path".
|
48
|
+
# Two file layouts are recognised:
# * "<dirs>/cdesc-<Name>.yaml" - a class description; the last directory
#   component is replaced by <Name> and the parts are joined with "::".
# * "<dirs>/<escaped>-i.yaml" or "-c.yaml" - a method description; <dirs> is
#   turned into a "::" path and joined to the unescaped method name with "#"
#   (for "i") or "." (for "c").
# Any other path yields nil.
def gem_relpath_to_full_name(relpath)
  case relpath
  when %r{^(.*)/cdesc-([^/]*)\.yaml$}
    dir, class_name = Regexp.last_match.captures
    namespace = dir.split(%r{/})
    namespace.pop
    namespace.push(class_name).join("::")
  when %r{^(.*)/([^/]*)-(i|c)\.yaml$}
    dir, escaped, kind = Regexp.last_match.captures
    method_name = RI::RiWriter.external_to_internal(escaped)
    separator = kind == 'c' ? "." : "#"
    dir.gsub("/", "::") + separator + method_name
  end
end
|
60
|
+
module_function :gem_relpath_to_full_name
|
61
|
+
|
62
|
+
# Returns the home directory (win32-aware).
|
63
|
+
# Lookup order: $HOME, $USERPROFILE, $HOMEDRIVE + $HOMEPATH, then
# File.expand_path("~"). If even that fails, the filesystem root is returned
# ("C:/" when File::ALT_SEPARATOR is set, "/" otherwise).
def find_home
  # stolen from RubyGems
  %w[HOME USERPROFILE].each do |homekey|
    return ENV[homekey] if ENV[homekey]
  end

  drive, homepath = ENV['HOMEDRIVE'], ENV['HOMEPATH']
  if drive && homepath
    # HOMEDRIVE normally already ends in ":" (e.g. "C:"); normalise so that
    # exactly one colon separates the drive from the path.
    return "#{drive.sub(/:\z/, '')}:#{homepath}"
  end

  begin
    File.expand_path("~")
  rescue StandardError
    File::ALT_SEPARATOR ? "C:/" : "/"
  end
end
|
81
|
+
module_function :find_home
|
82
|
+
end # module Util
|
83
|
+
end # module FastRI
|
data/lib/fastri/version.rb
CHANGED
@@ -2,7 +2,12 @@
|
|
2
2
|
#
|
3
3
|
|
4
4
|
module FastRI
|
5
|
-
FASTRI_VERSION = "0.
|
5
|
+
FASTRI_VERSION = "0.2.0"
|
6
|
+
FASTRI_RELEASE_DATE = "2006-11-15"
|
6
7
|
FASTRI_INDEX_FORMAT = "0.1.0"
|
8
|
+
FASTRI_FT_INDEX_FORMAT = "0.0.0"
|
9
|
+
FASTRI_FT_INDEX_FORMAT_MAJOR = "0"
|
10
|
+
FASTRI_FT_INDEX_FORMAT_MINOR = "0"
|
11
|
+
FASTRI_FT_INDEX_FORMAT_TEENY = "0"
|
7
12
|
end
|
8
13
|
# vi: set sw=2 expandtab:
|
@@ -0,0 +1,182 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
|
3
|
+
$:.unshift "lib"
|
4
|
+
require 'fastri/full_text_index'
|
5
|
+
|
6
|
+
class TestFullTextIndex < Test::Unit::TestCase
|
7
|
+
require 'stringio'
|
8
|
+
include FastRI
|
9
|
+
|
10
|
+
magic = FullTextIndexer::MAGIC
|
11
|
+
data = <<EOF
|
12
|
+
#{magic}this is a test
|
13
|
+
\r\000\000\000foo.txt\000\004\b{\000
|
14
|
+
zzzz
|
15
|
+
\r\000\000\000bar.txt\000\004\b{\000
|
16
|
+
EOF
|
17
|
+
DATA = (data.split(/\n/) << "").join("\0")
|
18
|
+
SUFFIXES = %w[a\ test is\ a test this zzzz].map{|w| [DATA.index(w)].pack("V")}.join("")
|
19
|
+
|
20
|
+
data = <<EOF
|
21
|
+
#{magic}this is a test
|
22
|
+
\r\000\000\000foo.txt\000\004\b{\000
|
23
|
+
zzzz this
|
24
|
+
\r\000\000\000bar.txt\000\004\b{\000
|
25
|
+
EOF
|
26
|
+
DATA2 = (data.split(/\n/) << "").join("\0")
|
27
|
+
SUFFIXES2 = ["a test", "is a", "test", "this\0", "this", "zzzz"].map{|x| [DATA2.index(x)].pack("V")}.join("")
|
28
|
+
|
29
|
+
data = <<EOF
|
30
|
+
#{magic}this is a test
|
31
|
+
SIZ1foo.txt\000#{Marshal.dump({:foo => :bar, :bar => 1})}
|
32
|
+
zzzz this
|
33
|
+
SIZ2bar.txt\000#{Marshal.dump({:foo => :baz, :bar => 42})}
|
34
|
+
EOF
|
35
|
+
lines = data.split(/\n/)
|
36
|
+
len1 = lines[1].size - 4 + 1
|
37
|
+
lines[1].sub!(/SIZ1/, [len1].pack("V"))
|
38
|
+
len2 = lines[3].size - 4 + 1
|
39
|
+
lines[3].sub!(/SIZ2/, [len2].pack("V"))
|
40
|
+
DATA3 = (lines << "").join("\0")
|
41
|
+
SUFFIXES3 = ["a test", "is a", "test", "this\0", "this", "zzzz"].map{|x| [DATA3.index(x)].pack("V")}.join("")
|
42
|
+
|
43
|
+
def setup
|
44
|
+
@index = FullTextIndex.new_from_ios(StringIO.new(DATA), StringIO.new(SUFFIXES))
|
45
|
+
@index2 = FullTextIndex.new_from_ios(StringIO.new(DATA2), StringIO.new(SUFFIXES2))
|
46
|
+
@index3 = FullTextIndex.new_from_ios(StringIO.new(DATA3), StringIO.new(SUFFIXES3))
|
47
|
+
end
|
48
|
+
|
49
|
+
def test_new_from_ios
|
50
|
+
a = nil
|
51
|
+
assert_nothing_raised { a = FullTextIndex.new_from_ios(StringIO.new(DATA), StringIO.new(SUFFIXES)) }
|
52
|
+
assert_equal(FullTextIndex::DEFAULT_OPTIONS[:max_query_size], a.max_query_size)
|
53
|
+
end
|
54
|
+
|
55
|
+
def test_lookup_basic
|
56
|
+
%w[this is a test].each do |term|
|
57
|
+
result = @index.lookup(term)
|
58
|
+
assert_kind_of(FullTextIndex::Result, result)
|
59
|
+
assert_equal(term, result.query)
|
60
|
+
assert_equal("foo.txt", result.path)
|
61
|
+
end
|
62
|
+
assert_equal(0, @index.lookup("a").index)
|
63
|
+
assert_equal(2, @index.lookup("t").index)
|
64
|
+
assert_equal(3, @index.lookup("th").index)
|
65
|
+
|
66
|
+
assert_equal(4, @index.lookup("z").index)
|
67
|
+
assert_equal("bar.txt", @index.lookup("z").path)
|
68
|
+
end
|
69
|
+
|
70
|
+
def test_lookup_metadata
|
71
|
+
assert_equal({}, @index.lookup("test").metadata)
|
72
|
+
assert_equal({}, @index.lookup("zzzz").metadata)
|
73
|
+
assert_equal({:foo => :bar, :bar => 1}, @index3.lookup("test").metadata)
|
74
|
+
assert_equal({:foo => :baz, :bar => 42}, @index3.lookup("zzz").metadata)
|
75
|
+
end
|
76
|
+
|
77
|
+
def test_Result_text
|
78
|
+
assert_equal("t", @index.lookup("this").text(1))
|
79
|
+
assert_equal("this", @index.lookup("this").text(4))
|
80
|
+
assert_equal("this is a ", @index.lookup("this").text(10))
|
81
|
+
assert_equal("this is a test ", @index.lookup("th").text(100))
|
82
|
+
|
83
|
+
assert_equal("test ", @index.lookup("t").text(10))
|
84
|
+
assert_equal("test ", @index.lookup("t").text(20))
|
85
|
+
|
86
|
+
assert_equal("z", @index.lookup("z").text(1))
|
87
|
+
assert_equal("zzzz", @index.lookup("z").text(10))
|
88
|
+
end
|
89
|
+
|
90
|
+
def test_Result_context
|
91
|
+
assert_equal(" a ", @index.lookup("a").context(1))
|
92
|
+
assert_equal("s a t", @index.lookup("a").context(2))
|
93
|
+
assert_equal("is a te", @index.lookup("a").context(3))
|
94
|
+
assert_equal("s is a test", @index.lookup("a").context(5))
|
95
|
+
assert_equal("this is a test ", @index.lookup("a").context(10))
|
96
|
+
end
|
97
|
+
|
98
|
+
def test_Result_context_non_initial_entry
|
99
|
+
assert_equal("zz", @index.lookup("z").context(1))
|
100
|
+
assert_equal("zzz", @index.lookup("z").context(2))
|
101
|
+
assert_equal("zzzz", @index.lookup("z").context(3))
|
102
|
+
assert_equal("zzzz", @index.lookup("z").context(4))
|
103
|
+
assert_equal("zzzz", @index.lookup("z").context(10))
|
104
|
+
end
|
105
|
+
|
106
|
+
def test_lookup_nonexistent
|
107
|
+
assert_nil(@index.lookup("bogus"))
|
108
|
+
end
|
109
|
+
|
110
|
+
def test_next_match_basic
|
111
|
+
first = @index2.lookup("t")
|
112
|
+
assert_equal("foo.txt", first.path)
|
113
|
+
assert_equal(2, first.index)
|
114
|
+
assert_equal("test ", first.text(10))
|
115
|
+
|
116
|
+
second = @index2.next_match(first)
|
117
|
+
assert_equal("bar.txt", second.path)
|
118
|
+
assert_equal(3, second.index)
|
119
|
+
assert_equal("this", second.text(10))
|
120
|
+
|
121
|
+
third = @index2.next_match(second)
|
122
|
+
assert_kind_of(FullTextIndex::Result, third)
|
123
|
+
assert_equal(4, third.index)
|
124
|
+
assert_equal("this is a ", third.text(10))
|
125
|
+
|
126
|
+
assert_nil(@index2.next_match(third))
|
127
|
+
end
|
128
|
+
|
129
|
+
def test_next_match_restricted
|
130
|
+
first = @index2.lookup("t")
|
131
|
+
assert_equal("foo.txt", first.path)
|
132
|
+
assert_equal(2, first.index)
|
133
|
+
assert_equal("test ", first.text(10))
|
134
|
+
|
135
|
+
second = @index2.next_match(first, "this is")
|
136
|
+
assert_equal("foo.txt", second.path)
|
137
|
+
assert_equal(4, second.index)
|
138
|
+
assert_equal("this is a ", second.text(10))
|
139
|
+
|
140
|
+
assert_nil(@index2.next_match(first, "foo"))
|
141
|
+
end
|
142
|
+
|
143
|
+
def test_next_match_regexp
|
144
|
+
first = @index2.lookup("t")
|
145
|
+
assert_equal("foo.txt", first.path)
|
146
|
+
assert_equal(2, first.index)
|
147
|
+
assert_equal("test ", first.text(10))
|
148
|
+
|
149
|
+
second = @index2.next_match(first, /.*test/)
|
150
|
+
assert_equal("foo.txt", second.path)
|
151
|
+
assert_equal(4, second.index)
|
152
|
+
assert_equal("this is a test ", second.text(20))
|
153
|
+
end
|
154
|
+
|
155
|
+
|
156
|
+
def test_next_matches
|
157
|
+
first = @index2.lookup("t")
|
158
|
+
all = [first] + @index2.next_matches(first)
|
159
|
+
assert_equal([2, 3, 4], all.map{|x| x.index})
|
160
|
+
assert_equal(["foo.txt", "bar.txt", "foo.txt"], all.map{|x| x.path})
|
161
|
+
one, two, three = *all
|
162
|
+
assert_equal(["test ", "this", "this is a test "], all.map{|x| x.text(20)})
|
163
|
+
end
|
164
|
+
|
165
|
+
def test_next_matches_restricted
|
166
|
+
first = @index2.lookup("t")
|
167
|
+
assert_equal([], @index2.next_matches(first, "this is not"))
|
168
|
+
all = @index2.next_matches(first, "this is")
|
169
|
+
assert_equal(["foo.txt"], all.map{|x| x.path})
|
170
|
+
assert_equal([4], all.map{|x| x.index})
|
171
|
+
assert_equal(["this is a test "], all.map{|x| x.text(20)})
|
172
|
+
end
|
173
|
+
|
174
|
+
def test_next_matches_regexp
|
175
|
+
first = @index2.lookup("t")
|
176
|
+
all = @index2.next_matches(first, /.*test/)
|
177
|
+
assert_equal(["foo.txt"], all.map{|x| x.path})
|
178
|
+
assert_equal([4], all.map{|x| x.index})
|
179
|
+
assert_equal(["this is a test "], all.map{|x| x.text(20)})
|
180
|
+
end
|
181
|
+
|
182
|
+
end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
|
3
|
+
$:.unshift "lib"
|
4
|
+
require 'fastri/full_text_indexer'
|
5
|
+
|
6
|
+
class TestFullTextIndexer < Test::Unit::TestCase
|
7
|
+
require 'stringio'
|
8
|
+
include FastRI
|
9
|
+
def setup
|
10
|
+
@indexer = FullTextIndexer.new(20)
|
11
|
+
end
|
12
|
+
|
13
|
+
DATA1 = "this is a test " * 1000
|
14
|
+
DATA2 = "this is another test " * 1000
|
15
|
+
def test_add_document
|
16
|
+
@indexer.add_document("foo.txt", DATA1)
|
17
|
+
assert_equal(["foo.txt"], @indexer.documents)
|
18
|
+
assert_equal(DATA1, @indexer.data("foo.txt"))
|
19
|
+
@indexer.add_document("foo.txt", DATA2)
|
20
|
+
assert_equal(["foo.txt"], @indexer.documents)
|
21
|
+
assert_equal(DATA2, @indexer.data("foo.txt"))
|
22
|
+
@indexer.add_document("bar.txt", DATA2)
|
23
|
+
assert_equal(["foo.txt", "bar.txt"], @indexer.documents)
|
24
|
+
assert_equal(DATA2, @indexer.data("bar.txt"))
|
25
|
+
end
|
26
|
+
|
27
|
+
def test_preprocess
|
28
|
+
data = "this is a \0foo bar\0 bla"
|
29
|
+
assert_equal("this is a foo bar bla", @indexer.preprocess(data))
|
30
|
+
end
|
31
|
+
|
32
|
+
def test_find_suffixes_simple
|
33
|
+
data = <<EOF
|
34
|
+
this is a simple test with these words: Aaaa01 0.1 _asdA1
|
35
|
+
EOF
|
36
|
+
assert_equal([0, 5, 8, 10, 17, 22, 27, 33, 40, 47, 49, 51],
|
37
|
+
@indexer.find_suffixes_simple(data, /[A-Za-z0-9_]+/, /[^A-Za-z0-9_]+/,0))
|
38
|
+
assert_equal([0, 5, 8, 10, 17, 22, 27, 33, 40, 52],
|
39
|
+
@indexer.find_suffixes_simple(data, /[A-Za-z]+/, /[^A-Za-z]+/, 0))
|
40
|
+
assert_equal([0, 5, 8, 10, 17, 22, 27, 33, 40, 52].map{|x| x+10},
|
41
|
+
@indexer.find_suffixes_simple(data, /[A-Za-z]+/, /[^A-Za-z]+/, 10))
|
42
|
+
assert_equal([0, 1, 2, 3, 5, 6, 8, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20,
|
43
|
+
22, 23, 24, 25, 27, 28, 29, 30, 31, 33, 34, 35, 36, 37,
|
44
|
+
|
45
|
+
40, 41, 42, 43, 52, 53, 54, 55],
|
46
|
+
@indexer.find_suffixes_simple(data, /[A-Za-z]/, /[^A-Za-z]+/, 0))
|
47
|
+
assert_equal([0, 5], @indexer.find_suffixes_simple("abcd\ndefg", /\S+/, /\s+/, 0))
|
48
|
+
assert_equal([1, 6], @indexer.find_suffixes_simple("abcd\ndefg", /\S+/, /\s+/, 1))
|
49
|
+
end
|
50
|
+
|
51
|
+
def test_build_index_trivial
|
52
|
+
@indexer.add_document("foo.txt", DATA1)
|
53
|
+
fulltext = StringIO.new("")
|
54
|
+
suffixarray = StringIO.new("")
|
55
|
+
@indexer.build_index(fulltext, suffixarray)
|
56
|
+
assert_equal(["\000\r\000\000\000foo.txt\000\004\b{\000\000"],
|
57
|
+
fulltext.string[-200..-1].scan(/\0.*$/))
|
58
|
+
assert_equal(4000 * 4, suffixarray.string.size)
|
59
|
+
end
|
60
|
+
|
61
|
+
def build_index_test_helper(data, suffixes)
|
62
|
+
@indexer.add_document("foo.txt", data)
|
63
|
+
offset = FullTextIndexer::MAGIC.size
|
64
|
+
suffixes = suffixes.map{|x| x + offset}
|
65
|
+
sorted = suffixes.sort_by{|i| data[i - offset]}
|
66
|
+
f_io = StringIO.new("")
|
67
|
+
sa_io = StringIO.new("")
|
68
|
+
@indexer.build_index(f_io, sa_io)
|
69
|
+
assert_equal(sorted, sa_io.string.scan(/..../m).map{|x| x.unpack("V")[0]})
|
70
|
+
end
|
71
|
+
|
72
|
+
def test_build_index_harder
|
73
|
+
data = <<EOF
|
74
|
+
a bcd efghi jklmn opqrst
|
75
|
+
EOF
|
76
|
+
suffixes = [0, 2, 6, 12, 18]
|
77
|
+
build_index_test_helper(data, suffixes)
|
78
|
+
data = <<EOF
|
79
|
+
e xcd afghi zklmn bpqrst
|
80
|
+
EOF
|
81
|
+
suffixes = [0, 2, 6, 12, 18]
|
82
|
+
build_index_test_helper(data, suffixes)
|
83
|
+
end
|
84
|
+
end
|