rsi 0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +25 -0
- data/Makefile +24 -0
- data/Manifest +30 -0
- data/README +49 -0
- data/TODO +30 -0
- data/bin/rsi_search.rb +50 -0
- data/bin/search_bench.rb +47 -0
- data/docs/ATTRIB +14 -0
- data/docs/Changes +25 -0
- data/docs/Roadmap +41 -0
- data/lib/rsi.rb +40 -0
- data/lib/rsi/analysis.rb +79 -0
- data/lib/rsi/compressed_serializers.rb +60 -0
- data/lib/rsi/dictionary.rb +232 -0
- data/lib/rsi/index.rb +245 -0
- data/lib/rsi/logmanager.rb +105 -0
- data/lib/rsi/porter.rb +213 -0
- data/lib/rsi/query.rb +98 -0
- data/lib/rsi/rsi_intro.rb +91 -0
- data/lib/rsi/serializers.rb +31 -0
- data/lib/rsi/stoplist.rb +72 -0
- data/lib/rsi/stoplist.txt +59 -0
- data/rsi.gemspec +59 -0
- data/setup.rb +1360 -0
- data/tests/suite_all.rb +14 -0
- data/tests/t_analysis.rb +43 -0
- data/tests/t_dictionary.rb +76 -0
- data/tests/t_index.rb +78 -0
- data/tests/t_index_multi.rb +71 -0
- data/version.release +1 -0
- metadata +72 -0
data/tests/suite_all.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#
|
2
|
+
# Test suite for running all RSI tests
|
3
|
+
#
|
4
|
+
require 'test/unit'
|
5
|
+
require 'tests/t_index'
|
6
|
+
require 'tests/t_index_multi'
|
7
|
+
require 'tests/t_dictionary'
|
8
|
+
require 'tests/t_analysis'
|
9
|
+
|
10
|
+
# Only when this suite file is the script being executed (not when it is
# merely required): give the RSI log manager an append-mode handle on
# /dev/null as its root_fh.
# NOTE(review): presumably this silences log output during the run --
# confirm against rsi/logmanager.
if __FILE__ == $0
  require 'rsi/logmanager'
  null_sink = File.open( "/dev/null", "a" )
  RSI::LogManager.instance.root_fh = null_sink
end
|
data/tests/t_analysis.rb
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
#!/usr/local/bin/ruby -w
|
2
|
+
require 'fileutils'
|
3
|
+
require 'test/unit'
|
4
|
+
require 'tmpdir'
|
5
|
+
require 'rsi/analysis'
|
6
|
+
require 'rsi/index' # for FIELD_TYPE_TEXT
|
7
|
+
|
8
|
+
# Unit tests for RSI::DefaultTextAnalyzer: the field types it reports and
# the terms it produces for the 'text' field.
class AnalTest < Test::Unit::TestCase

  DOC_A = "Weebles wobble but they don't fall down"
  DOC_B = "The boot is a whale, then?"

  # Every test starts with a fresh default analyzer.
  def setup
    @analyzer = RSI::DefaultTextAnalyzer.new
  end

  # Nothing to clean up.
  def teardown
  end

  # The analyzer must report exactly one field, 'text', typed as
  # RSI::FIELD_TYPE_TEXT.
  def test_types
    field_types = @analyzer.get_field_types
    assert_equal( 1, field_types.size, "Only one type" )
    assert_equal( RSI::FIELD_TYPE_TEXT, field_types['text'], "Text type dictionary" )
  end

  # Checks term counts for the two sample phrases, that "don't" shows up
  # as the term DONT, and that THE is not among the terms.
  def test_text
    terms = text_terms( DOC_A )
    assert_equal( 5, terms.size, "Phrase contains 5 interesting terms" )
    assert( terms.include?( "DONT" ), "DONT should be in termlist" )

    terms = text_terms( DOC_B )
    assert_equal( 3, terms.size, "Phrase contains 3 non-stopwords" )
    assert( !terms.include?( "THE" ), "The is a stopword" )
  end

  # Tokenizes a phrase mixing singular/plural pairs with stopwords and
  # expects five terms in total, none of them THE.
  def test_stopwords
    terms = text_terms( "dog dogs buggy buggies child children The an a stop" )
    assert_equal( 5, terms.size, "porter.rb doesn't stem 'children' right" )
    assert( !terms.include?( "THE" ), "The is a stopword" )
  end

  private

  # Runs the analyzer over +content+ and returns the 'text' entry of the
  # result.
  def text_terms( content )
    @analyzer.tokenize( content )['text']
  end

end
|
data/tests/t_dictionary.rb
ADDED
@@ -0,0 +1,76 @@
|
|
1
|
+
# -*- ruby -*-
|
2
|
+
#
|
3
|
+
# Tests for rsi/dictionary.rb
|
4
|
+
#
|
5
|
+
|
6
|
+
require 'test/unit'
|
7
|
+
require 'fileutils'
|
8
|
+
require 'tmpdir'
|
9
|
+
require 'rsi/dictionary'
|
10
|
+
require 'rsi/serializers'
|
11
|
+
require 'rsi/logmanager'
|
12
|
+
|
13
|
+
# Tests for RSI::Dictionary: creating a dictionary under a temp path,
# storing one term entry, and reading it back from a second instance.
class DictTest < Test::Unit::TestCase

  # Chooses a path under the system temp dir for this test's dictionary.
  # The directory itself is NOT created here; create_dictionary asserts it
  # does not exist yet.
  def setup()
    @failed = false
    @tmp = Dir::tmpdir()
    fn = "d_#{$$}-#{rand(65536)}"
    @root = File.join( @tmp, fn )
  end

  # Records that a failure was reported so teardown can keep the files.
  # Takes any number of arguments and forwards them unchanged via bare
  # +super+, so the override does not impose its own arity on the
  # framework's hook.
  def add_failure( *args )
    super
    @failed = true
  end

  # Delete the temp dir for the dictionary tests, unless a failure was
  # recorded, in which case it is left in place for inspection.
  def teardown()
    if @failed
      print "Test case failed, not cleaning up #@root\n"
    else
      FileUtils::rm_rf( @root )
    end
  end

  # A dictionary can be created, populated and stored.
  def test_create()
    create_dictionary()
  end

  # A second Dictionary instance opened on the same root sees the entry
  # stored by create_dictionary.
  def test_reopen()
    create_dictionary()
    assert( File.exist?( @root ), "Test dict root should exist" )
    d = RSI::Dictionary.new( @root )
    d.serializer = RSI::YAMLSerializer.new()
    d.open()

    assert_strawberry_entry( d, "Should get same termid both times" )
  end

  # Builds a dictionary at @root holding a single entry (term STRAWBERRY
  # in document "poo.txt" at position 0), stores it, and verifies the
  # entry through the same instance.
  def create_dictionary()
    assert( !File.exist?( @root ), "Test dict root should not exist" )
    d = RSI::Dictionary.new( @root )
    d.serializer = RSI::YAMLSerializer.new()
    d.open()
    assert( File.directory?( @root ), "Root should exist" )

    id = d.get_termid_for( "STRAWBERRY", true )
    d.add_term_entries( "poo.txt", id, [0] )
    d.store()

    assert_strawberry_entry( d, "Should get one termid" )
  end

  private

  # Asserts that +dict+ maps STRAWBERRY to termid 1 and holds exactly one
  # entry for it: document "poo.txt", frequency 1, one position.
  # +termid_msg+ is the failure message used for the termid check.
  def assert_strawberry_entry( dict, termid_msg )
    id2 = dict.get_termid_for( "STRAWBERRY" )
    assert_equal( 1, id2, termid_msg )
    el = dict.get_entry_list( id2 )
    assert_equal( 1, el.length(), "Should be one entry" )
    assert_equal( "poo.txt", el[0].docid )
    assert_equal( 1, el[0].freq )
    assert_equal( 1, el[0].pos_list.length() )
  end

end
|
data/tests/t_index.rb
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
#!/usr/local/bin/ruby -w
|
2
|
+
require 'fileutils'
|
3
|
+
require 'test/unit'
|
4
|
+
require 'tmpdir'
|
5
|
+
require 'rsi'
|
6
|
+
require 'rsi/logmanager'
|
7
|
+
|
8
|
+
# Sample documents fed to the indexer by the tests in this file.
DOC_A = "The quick brown fox jumped over the lazy dogs"
DOC_B = "Every dog has his day"
DOC_C = "Bjork, bjork, were you brought by the stork, or were you" \
        " created from bubbles and cork?"
|
12
|
+
|
13
|
+
# Tests for RSI::Indexer: indexes the three sample documents into a temp
# directory and checks which documents find_all returns.
class IdxTest < Test::Unit::TestCase

  # Create a temp dir for the indexes in the test
  def setup()
    @tmp = Dir::tmpdir()
    @root = File.join( @tmp, "searchtest.#{$$}.#{rand(65535)}" )
    Dir.mkdir( @root ) # will raise if root exists, which is what I want
    @failed = false
  end

  # Records that a failure was reported so teardown can keep the files.
  # Takes any number of arguments and forwards them unchanged via bare
  # +super+, so the override does not impose its own arity on the
  # framework's hook.
  def add_failure( *args )
    super
    @failed = true
  end

  # Delete the temp dir for the index tests, unless a failure was
  # recorded, in which case it is left in place for inspection.
  def teardown()
    if @failed
      print "Test case failed, not cleaning up #@root\n"
    else
      FileUtils::rm_rf( @root )
    end
  end

  # Create an index, assert that it contains the right docs and terms
  def test_basic()
    indexer = RSI::Indexer.new( @root )
    indexer.serializer = RSI::YAMLSerializer.new()
    indexer.open()
    indexer.add_document( "DOC_A", DOC_A )
    indexer.add_document( "DOC_B", DOC_B )
    indexer.add_document( "DOC_C", DOC_C )
    indexer.flush()
    assert_finds( indexer )
  end

  # Creates an index, then asserts that it can be re-opened.
  # The __SKIP__ prefix keeps the name from starting with "test", so the
  # runner does not pick this method up.
  def __SKIP__test_reopen()
    test_basic() # build index with 3 docs
    indexer = RSI::Indexer.new( @root )
    indexer.serializer = RSI::YAMLSerializer.new()
    indexer.open()
    assert_finds( indexer )
  end

  # Assert that the given index contains the right documents/terms
  def assert_finds( indexer )
    a = indexer.find_all( "Bjork" )
    assert_equal( 1, a.length(), "Should have one hit (DOC_C)" )
    assert_equal( "DOC_C", a[0], "Should find DOC_C" )

    # Same query in lower case must give the same single hit.
    a = indexer.find_all( "bjork" )
    assert_equal( 1, a.length(), "(+i) Should have one hit (DOC_C)" )
    assert_equal( "DOC_C", a[0], "(+i) Should find DOC_C" )

    # DOC_A contains "dogs" and DOC_B contains "dog"; both are expected.
    a = indexer.find_all( "dog" )
    assert_equal( 2, a.length(), "Should have two hits (DOC_A, DOC_B)" )
  end

end
|
73
|
+
|
74
|
+
# Only when this file is the script being executed: set the log manager's
# root to the current directory and its log filename to "t_index.log".
if __FILE__ == $0
  RSI::LogManager.instance.root = "."
  RSI::LogManager.instance.log_filename = "t_index.log"
end
|
78
|
+
|
data/tests/t_index_multi.rb
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
#!/usr/local/bin/ruby -w
|
2
|
+
require 'fileutils'
|
3
|
+
require 'tmpdir'
|
4
|
+
require 'test/unit'
|
5
|
+
require 'rsi'
|
6
|
+
|
7
|
+
module RSI

  # Sample two-field analyzer used by the tests: the first line of a
  # document is tokenized as its 'subject', and the whole document
  # (first line included) as its 'text'. Tokenizing itself is delegated
  # to +base+, which must respond to tokenize_text.
  class TestX2SampleAnalyzer
    attr_accessor :base

    # base:: object whose tokenize_text method does the actual
    #        tokenizing; defaults to a new RSI::DefaultTextAnalyzer.
    def initialize( base=RSI::DefaultTextAnalyzer.new() )
      @base = base
    end

    # Returns the field-name => field-type map: both 'text' and 'subject'
    # are RSI::FIELD_TYPE_TEXT.
    def get_field_types()
      return { "text" => RSI::FIELD_TYPE_TEXT,
               "subject" => RSI::FIELD_TYPE_TEXT }
    end

    # Splits +content+ into fields and tokenizes each one.
    #
    # Returns a hash with keys 'subject' and 'text' whose values are
    # whatever base.tokenize_text returns for the first line and for the
    # full content respectively.
    def tokenize( content )
      # Everything up to (not including) the first newline. This pattern
      # always matches, so a document with no newline at all yields its
      # whole content as the subject rather than nil.
      subject = content[ /\A[^\n]*/ ]
      r = {}
      r['subject'] = @base.tokenize_text( subject )
      r['text'] = @base.tokenize_text( content )
      return r
    end

  end
end
|
30
|
+
|
31
|
+
# Exercises RSI::Indexer with the two-field RSI::TestX2SampleAnalyzer:
# indexes one document and runs a query against it.
class X2SampleAnalyzerTest < Test::Unit::TestCase

  # First line is the subject; the remainder is the body.
  DOC_A = "The subject was a long and pointless one\n" +
    "\n"+
    "Also, so was the story.\n\n"

  # Create a temp dir for the index; Dir.mkdir raises if it already exists.
  def setup()
    @tmp = Dir::tmpdir()
    @root = File.join( @tmp, "searchtest.#{$$}.#{rand(65535)}" )
    Dir.mkdir( @root )
    @failed = false
  end

  # Records that a failure was reported so teardown can keep the files.
  # Takes any number of arguments and forwards them unchanged via bare
  # +super+, so the override does not impose its own arity on the
  # framework's hook.
  def add_failure( *args )
    super
    @failed = true
  end

  # Delete the temp dir for the index tests, unless a failure was
  # recorded, in which case it is left in place for inspection.
  def teardown()
    if @failed
      print "Test case failed, not cleaning up #@root\n"
    else
      FileUtils::rm_rf( @root )
    end
  end

  # Indexes DOC_A through the sample analyzer and runs a two-word query.
  # There are no assertions on the result yet; the test only checks that
  # indexing and querying complete without raising.
  def test_sample()
    indexer = RSI::Indexer.new( @root )
    indexer.serializer = RSI::YAMLSerializer.new()
    indexer.analyzer = RSI::TestX2SampleAnalyzer.new()
    indexer.open()
    indexer.add_document( "DOC_A", DOC_A )
    indexer.flush()

    # TODO: queryanalyzers
    # Result intentionally not bound to a local: nothing inspects it.
    indexer.find_all( "pointless subject" )
  end

end
|
71
|
+
|
data/version.release
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.4
|
metadata
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.8.4
|
3
|
+
specification_version: 1
|
4
|
+
name: rsi
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: "0.4"
|
7
|
+
date: 2005-02-04
|
8
|
+
summary: RSI (Ruby Simple Indexer) is a simple full text index
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
email: gdf@rubyforge.org
|
12
|
+
homepage: http://rsi.rubyforge.org
|
13
|
+
rubyforge_project: rsi
|
14
|
+
description: "RSI is a simple full text search engine implementation in Ruby. It aims to be
|
15
|
+
easily useful within other programs: simple to set up, simple to use. An
|
16
|
+
emphasis has been placed on getting functionality out the door, rather than
|
17
|
+
heavy optimization (that can come later). It still appears to be reasonably
|
18
|
+
fast and efficient (while admitting to have not been heavily profiled...)."
|
19
|
+
autorequire: rsi
|
20
|
+
default_executable: rsi_search.rb
|
21
|
+
bindir: bin
|
22
|
+
has_rdoc: true
|
23
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
24
|
+
requirements:
|
25
|
+
-
|
26
|
+
- ">"
|
27
|
+
- !ruby/object:Gem::Version
|
28
|
+
version: 0.0.0
|
29
|
+
version:
|
30
|
+
platform: ruby
|
31
|
+
authors:
|
32
|
+
- Greg Fast
|
33
|
+
files:
|
34
|
+
- LICENSE
|
35
|
+
- Makefile
|
36
|
+
- Manifest
|
37
|
+
- README
|
38
|
+
- TODO
|
39
|
+
- rsi.gemspec
|
40
|
+
- setup.rb
|
41
|
+
- version.release
|
42
|
+
- bin/rsi_search.rb
|
43
|
+
- bin/search_bench.rb
|
44
|
+
- docs/ATTRIB
|
45
|
+
- docs/Changes
|
46
|
+
- docs/Roadmap
|
47
|
+
- lib/rsi.rb
|
48
|
+
- lib/rsi/analysis.rb
|
49
|
+
- lib/rsi/compressed_serializers.rb
|
50
|
+
- lib/rsi/dictionary.rb
|
51
|
+
- lib/rsi/index.rb
|
52
|
+
- lib/rsi/logmanager.rb
|
53
|
+
- lib/rsi/porter.rb
|
54
|
+
- lib/rsi/query.rb
|
55
|
+
- lib/rsi/rsi_intro.rb
|
56
|
+
- lib/rsi/serializers.rb
|
57
|
+
- lib/rsi/stoplist.rb
|
58
|
+
- lib/rsi/stoplist.txt
|
59
|
+
- tests/suite_all.rb
|
60
|
+
- tests/t_analysis.rb
|
61
|
+
- tests/t_index.rb
|
62
|
+
- tests/t_index_multi.rb
|
63
|
+
- tests/t_dictionary.rb
|
64
|
+
test_files:
|
65
|
+
- tests/suite_all.rb
|
66
|
+
rdoc_options: []
|
67
|
+
extra_rdoc_files: []
|
68
|
+
executables:
|
69
|
+
- rsi_search.rb
|
70
|
+
extensions: []
|
71
|
+
requirements: []
|
72
|
+
dependencies: []
|