rsi 0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +25 -0
- data/Makefile +24 -0
- data/Manifest +30 -0
- data/README +49 -0
- data/TODO +30 -0
- data/bin/rsi_search.rb +50 -0
- data/bin/search_bench.rb +47 -0
- data/docs/ATTRIB +14 -0
- data/docs/Changes +25 -0
- data/docs/Roadmap +41 -0
- data/lib/rsi.rb +40 -0
- data/lib/rsi/analysis.rb +79 -0
- data/lib/rsi/compressed_serializers.rb +60 -0
- data/lib/rsi/dictionary.rb +232 -0
- data/lib/rsi/index.rb +245 -0
- data/lib/rsi/logmanager.rb +105 -0
- data/lib/rsi/porter.rb +213 -0
- data/lib/rsi/query.rb +98 -0
- data/lib/rsi/rsi_intro.rb +91 -0
- data/lib/rsi/serializers.rb +31 -0
- data/lib/rsi/stoplist.rb +72 -0
- data/lib/rsi/stoplist.txt +59 -0
- data/rsi.gemspec +59 -0
- data/setup.rb +1360 -0
- data/tests/suite_all.rb +14 -0
- data/tests/t_analysis.rb +43 -0
- data/tests/t_dictionary.rb +76 -0
- data/tests/t_index.rb +78 -0
- data/tests/t_index_multi.rb +71 -0
- data/version.release +1 -0
- metadata +72 -0
data/tests/suite_all.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#
|
2
|
+
# Test suite for running all RSI tests
|
3
|
+
#
|
4
|
+
require 'test/unit'
|
5
|
+
require 'tests/t_index'
|
6
|
+
require 'tests/t_index_multi'
|
7
|
+
require 'tests/t_dictionary'
|
8
|
+
require 'tests/t_analysis'
|
9
|
+
|
10
|
+
# When this file is run directly (not required by another script), send RSI
# log output to the null device so it does not clutter the test run.
if $0 == __FILE__
  require 'rsi/logmanager'
  # File::NULL names the platform's null device on Rubies that define it;
  # fall back to the original hard-coded path on those that do not.
  null_device = defined?( File::NULL ) ? File::NULL : "/dev/null"
  # Not closed here: the handle is handed to the log manager, which
  # presumably keeps writing to it for the rest of the run.
  fh = File.open( null_device, "a" )
  RSI::LogManager.instance().root_fh = fh
end
|
data/tests/t_analysis.rb
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
#!/usr/local/bin/ruby -w
|
2
|
+
require 'fileutils'
|
3
|
+
require 'test/unit'
|
4
|
+
require 'tmpdir'
|
5
|
+
require 'rsi/analysis'
|
6
|
+
require 'rsi/index' # for FIELD_TYPE_TEXT
|
7
|
+
|
8
|
+
# Tests for RSI::DefaultTextAnalyzer: the field types it reports, and how it
# tokenizes text and drops stopwords.
class AnalTest < Test::Unit::TestCase

  DOC_A = "Weebles wobble but they don't fall down"
  DOC_B = "The boot is a whale, then?"

  # Build a fresh analyzer for every test.
  def setup()
    @a = RSI::DefaultTextAnalyzer.new()
  end

  # Nothing to clean up.
  def teardown(); end

  # The analyzer should report exactly one field, 'text', of FIELD_TYPE_TEXT.
  def test_types()
    t = @a.get_field_types()
    assert_equal( 1, t.size(), "Only one type" )
    assert_equal( RSI::FIELD_TYPE_TEXT, t['text'], "Text type dictionary" )
  end

  # Tokenizing the two sample sentences should yield the expected number of
  # terms, keep "DONT" and drop "THE".
  def test_text()
    t = @a.tokenize( DOC_A )
    #puts t['text'].join(":")
    assert_equal( 5, t['text'].size(), "Phrase contains 5 interesting terms" )
    assert( t['text'].include?( "DONT" ), "DONT should be in termlist" )
    t = @a.tokenize( DOC_B )
    assert_equal( 3, t['text'].size(), "Phrase contains 3 non-stopwords" )
    assert( ! t['text'].include?( "THE" ), "The is a stopword" )
  end

  # A source mixing singular/plural word pairs with stopwords should come
  # back as five terms, none of which is "THE".
  def test_stopwords()
    source = "dog dogs buggy buggies child children The an a stop"
    t = @a.tokenize( source )
    #puts t['text'].join(":")
    assert_equal( 5, t['text'].size(), "porter.rb doesn't stem 'children' right" )
    assert( !t['text'].include?( "THE" ), "The is a stopword" )
  end

end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
# -*- ruby -*-
|
2
|
+
#
|
3
|
+
# Tests for rsi/dictionary.rb
|
4
|
+
#
|
5
|
+
|
6
|
+
require 'test/unit'
|
7
|
+
require 'fileutils'
|
8
|
+
require 'tmpdir'
|
9
|
+
require 'rsi/dictionary'
|
10
|
+
require 'rsi/serializers'
|
11
|
+
require 'rsi/logmanager'
|
12
|
+
|
13
|
+
# Tests for RSI::Dictionary: create a dictionary on disk, store a single term
# entry, and read it back both immediately and after re-opening.
class DictTest < Test::Unit::TestCase

  # Pick a (not yet existing) dictionary root under the system temp dir.
  def setup()
    @failed = false
    @tmp = Dir::tmpdir()
    fn = "d_#{$$}-#{rand(65536)}"
    @root = File.join( @tmp, fn )
  end

  # Record that a failure happened so teardown() can leave the files in place
  # for inspection. Extra arguments are accepted and forwarded because some
  # test-unit versions pass more than (message, backtrace).
  def add_failure( msg, bt, *rest )
    super
    @failed = true
  end

  # Delete the temp dir for the index tests
  def teardown()
    if @failed
      print "Test case failed, not cleaning up #@root\n"
    else
      FileUtils::rm_rf( @root )
    end
  end

  def test_create()
    create_dictionary()
  end

  # Build a dictionary, then open a second Dictionary on the same root and
  # check that the stored term entry is still there.
  def test_reopen()
    create_dictionary()
    # File.exist? replaces FileTest.exists?, which newer Rubies no longer have.
    assert( File.exist?( @root ), "Test dict root should exist" )
    d = RSI::Dictionary.new( @root )
    d.serializer = RSI::YAMLSerializer.new()
    d.open()

    assert_strawberry_entry( d, "Should get same termid both times" )
  end

  # Create a dictionary at @root containing the term "STRAWBERRY" with one
  # entry for document "poo.txt", store it, and verify it can be read back.
  def create_dictionary()
    assert( !File.exist?( @root ), "Test dict root should not exist" )
    d = RSI::Dictionary.new( @root )
    d.serializer = RSI::YAMLSerializer.new()
    d.open()
    assert( FileTest.directory?( @root ), "Root should exist" )

    id = d.get_termid_for( "STRAWBERRY", true )
    d.add_term_entries( "poo.txt", id, [0] )
    d.store()

    assert_strawberry_entry( d, "Should get one termid" )
  end

  private

  # Assert that dictionary +d+ maps "STRAWBERRY" to term id 1 and holds exactly
  # one entry for it: docid "poo.txt", frequency 1, one position.
  # +termid_msg+ is the failure message used for the term id check.
  def assert_strawberry_entry( d, termid_msg )
    id2 = d.get_termid_for( "STRAWBERRY" )
    assert_equal( 1, id2, termid_msg )
    el = d.get_entry_list( id2 )
    assert_equal( 1, el.length(), "Should be one entry" )
    assert_equal( "poo.txt", el[0].docid )
    assert_equal( 1, el[0].freq )
    assert_equal( 1, el[0].pos_list.length() )
  end

end
|
data/tests/t_index.rb
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
#!/usr/local/bin/ruby -w
|
2
|
+
require 'fileutils'
|
3
|
+
require 'test/unit'
|
4
|
+
require 'tmpdir'
|
5
|
+
require 'rsi'
|
6
|
+
require 'rsi/logmanager'
|
7
|
+
|
8
|
+
# Sample documents indexed by the tests below.
DOC_A = "The quick brown fox jumped over the lazy dogs"
DOC_B = "Every dog has his day"
DOC_C = "Bjork, bjork, were you brought by the stork, or were you" +
  " created from bubbles and cork?"
|
12
|
+
|
13
|
+
# Tests for RSI::Indexer: index three small documents and check that term
# lookups return the expected document ids.
class IdxTest < Test::Unit::TestCase

  # Create a temp dir for the indexes in the test
  def setup()
    @tmp = Dir::tmpdir()
    @root = File.join( @tmp, "searchtest.#{$$}.#{rand(65535)}" )
    Dir.mkdir( @root ) # will raise if root exists, which is what I want
    @failed = false
  end

  # Record that a failure happened so teardown() can leave the index files in
  # place. Extra arguments are accepted and forwarded because some test-unit
  # versions pass more than (message, backtrace).
  def add_failure( msg, bt, *rest )
    super
    @failed = true
  end

  # Delete the temp dir for the index tests
  def teardown()
    if @failed
      print "Test case failed, not cleaning up #@root\n"
    else
      FileUtils::rm_rf( @root )
    end
  end

  # Create an index, assert that it contains the right docs and terms
  def test_basic()
    indexer = RSI::Indexer.new( @root )
    indexer.serializer = RSI::YAMLSerializer.new()
    indexer.open()
    indexer.add_document( "DOC_A", DOC_A )
    indexer.add_document( "DOC_B", DOC_B )
    indexer.add_document( "DOC_C", DOC_C )
    indexer.flush()
    assert_finds( indexer )
  end

  # Creates an index, then asserts that it can be re-opened.
  # The __SKIP__ prefix keeps the test runner from picking this method up.
  def __SKIP__test_reopen()
    test_basic() # build index with 3 docs
    indexer = RSI::Indexer.new( @root )
    indexer.serializer = RSI::YAMLSerializer.new()
    indexer.open()
    assert_finds( indexer )
  end

  # Assert that the given index contains the right documents/terms
  def assert_finds( indexer )
    a = indexer.find_all( "Bjork" )
    assert_equal( 1, a.length(), "Should have one hit (DOC_C)" )
    assert_equal( "DOC_C", a[0], "Should find DOC_C" )

    # Same query in lower case: the lookup is expected to ignore case.
    a = indexer.find_all( "bjork" )
    assert_equal( 1, a.length(), "(+i) Should have one hit (DOC_C)" )
    assert_equal( "DOC_C", a[0], "(+i) Should find DOC_C" )

    # "dog" should hit both DOC_A ("dogs") and DOC_B ("dog").
    a = indexer.find_all( "dog" )
    assert_equal( 2, a.length(), "Should have two hits (DOC_A, DOC_B)" )
  end

end
|
73
|
+
|
74
|
+
# When this file is run directly, point the RSI log manager at the current
# directory and name the log file after this test.
if $0 == __FILE__
  RSI::LogManager.instance.root = "."
  RSI::LogManager.instance.log_filename = "t_index.log"
end
|
78
|
+
|
@@ -0,0 +1,71 @@
|
|
1
|
+
#!/usr/local/bin/ruby -w
|
2
|
+
require 'fileutils'
|
3
|
+
require 'tmpdir'
|
4
|
+
require 'test/unit'
|
5
|
+
require 'rsi'
|
6
|
+
|
7
|
+
module RSI

  # Sample analyzer with two text fields: 'subject' (the first line of the
  # content) and 'text' (the whole content). Tokenizing is delegated to a
  # base analyzer's tokenize_text().
  class TestX2SampleAnalyzer
    attr_accessor :base

    # base:: analyzer whose tokenize_text() does the actual tokenizing;
    #        defaults to a new RSI::DefaultTextAnalyzer.
    def initialize( base=RSI::DefaultTextAnalyzer.new() )
      @base = base
    end

    # Returns a hash mapping each field name to its field type.
    def get_field_types()
      return { "text" => RSI::FIELD_TYPE_TEXT,
               "subject" => RSI::FIELD_TYPE_TEXT }
    end

    # Returns a hash with the tokens for 'subject' and 'text'.
    # The subject is everything before the first newline; content without any
    # newline is treated as being all subject, so that tokenize_text() is
    # never handed nil.
    def tokenize( content )
      r = {}
      subject = content[ /^(.*?)\n/, 1 ] || content
      r['subject'] = @base.tokenize_text( subject )
      r['text'] = @base.tokenize_text( content )
      return r
    end

  end
end
|
30
|
+
|
31
|
+
# Exercises RSI::Indexer with the two-field RSI::TestX2SampleAnalyzer.
class X2SampleAnalyzerTest < Test::Unit::TestCase

  DOC_A = "The subject was a long and pointless one\n" +
    "\n"+
    "Also, so was the story.\n\n"

  # Create a temp dir for the index used by the test.
  def setup()
    @tmp = Dir::tmpdir()
    @root = File.join( @tmp, "searchtest.#{$$}.#{rand(65535)}" )
    Dir.mkdir( @root )
    @failed = false
  end

  # Record that a failure happened so teardown() can leave the index files in
  # place. Extra arguments are accepted and forwarded because some test-unit
  # versions pass more than (message, backtrace).
  def add_failure( msg, bt, *rest )
    super
    @failed = true
  end

  # Delete the temp dir for the index tests
  def teardown()
    if @failed
      print "Test case failed, not cleaning up #@root\n"
    else
      FileUtils::rm_rf( @root )
    end
  end

  # Index one document through the sample analyzer and run a query against it.
  # There are no assertions on the result yet, so this only checks that
  # indexing and querying complete without raising.
  def test_sample()
    indexer = RSI::Indexer.new( @root )
    indexer.serializer = RSI::YAMLSerializer.new()
    indexer.analyzer = RSI::TestX2SampleAnalyzer.new()
    indexer.open()
    indexer.add_document( "DOC_A", DOC_A )
    indexer.flush()

    # TODO: queryanalyzers
    indexer.find_all( "pointless subject" )
  end

end
|
71
|
+
|
data/version.release
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.4
|
metadata
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.8.4
|
3
|
+
specification_version: 1
|
4
|
+
name: rsi
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: "0.4"
|
7
|
+
date: 2005-02-04
|
8
|
+
summary: RSI (Ruby Simple Indexer) is a simple full text index
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
email: gdf@rubyforge.org
|
12
|
+
homepage: http://rsi.rubyforge.org
|
13
|
+
rubyforge_project: rsi
|
14
|
+
description: "RSI is a simple full text search engine implementation in Ruby. It aims to be
|
15
|
+
easily useful within other programs: simple to set up, simple to use. An
|
16
|
+
emphasis has been placed on getting functionality out the door, rather than
|
17
|
+
heavy optimization (that can come later). It still appears to be reasonably
|
18
|
+
fast and efficient (while admitting to have not been heavily profiled...)."
|
19
|
+
autorequire: rsi
|
20
|
+
default_executable: rsi_search.rb
|
21
|
+
bindir: bin
|
22
|
+
has_rdoc: true
|
23
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
24
|
+
requirements:
|
25
|
+
-
|
26
|
+
- ">"
|
27
|
+
- !ruby/object:Gem::Version
|
28
|
+
version: 0.0.0
|
29
|
+
version:
|
30
|
+
platform: ruby
|
31
|
+
authors:
|
32
|
+
- Greg Fast
|
33
|
+
files:
|
34
|
+
- LICENSE
|
35
|
+
- Makefile
|
36
|
+
- Manifest
|
37
|
+
- README
|
38
|
+
- TODO
|
39
|
+
- rsi.gemspec
|
40
|
+
- setup.rb
|
41
|
+
- version.release
|
42
|
+
- bin/rsi_search.rb
|
43
|
+
- bin/search_bench.rb
|
44
|
+
- docs/ATTRIB
|
45
|
+
- docs/Changes
|
46
|
+
- docs/Roadmap
|
47
|
+
- lib/rsi.rb
|
48
|
+
- lib/rsi/analysis.rb
|
49
|
+
- lib/rsi/compressed_serializers.rb
|
50
|
+
- lib/rsi/dictionary.rb
|
51
|
+
- lib/rsi/index.rb
|
52
|
+
- lib/rsi/logmanager.rb
|
53
|
+
- lib/rsi/porter.rb
|
54
|
+
- lib/rsi/query.rb
|
55
|
+
- lib/rsi/rsi_intro.rb
|
56
|
+
- lib/rsi/serializers.rb
|
57
|
+
- lib/rsi/stoplist.rb
|
58
|
+
- lib/rsi/stoplist.txt
|
59
|
+
- tests/suite_all.rb
|
60
|
+
- tests/t_analysis.rb
|
61
|
+
- tests/t_index.rb
|
62
|
+
- tests/t_index_multi.rb
|
63
|
+
- tests/t_dictionary.rb
|
64
|
+
test_files:
|
65
|
+
- tests/suite_all.rb
|
66
|
+
rdoc_options: []
|
67
|
+
extra_rdoc_files: []
|
68
|
+
executables:
|
69
|
+
- rsi_search.rb
|
70
|
+
extensions: []
|
71
|
+
requirements: []
|
72
|
+
dependencies: []
|