rsi 0.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,14 @@
1
#
# Test suite for running all RSI tests.
#
# Loads every RSI test file so that a single run of this script pulls in
# all of their test cases.
#
require 'test/unit'

# Load order matches the original explicit require list.
%w[
  tests/t_index
  tests/t_index_multi
  tests/t_dictionary
  tests/t_analysis
].each { |test_file| require test_file }

if __FILE__ == $0
  require 'rsi/logmanager'
  # Hand the LogManager an append-mode handle on /dev/null as its root
  # file handle (presumably to discard log output while the suite runs).
  RSI::LogManager.instance().root_fh = File.open( "/dev/null", "a" )
end
@@ -0,0 +1,43 @@
1
+ #!/usr/local/bin/ruby -w
2
+ require 'fileutils'
3
+ require 'test/unit'
4
+ require 'tmpdir'
5
+ require 'rsi/analysis'
6
+ require 'rsi/index' # for FIELD_TYPE_TEXT
7
+
8
# Tests for RSI::DefaultTextAnalyzer: the field types it declares and the
# terms it produces for the 'text' field.
class AnalTest < Test::Unit::TestCase

  DOC_A = "Weebles wobble but they don't fall down"
  DOC_B = "The boot is a whale, then?"

  # Build a fresh analyzer for every test.
  def setup()
    @a = RSI::DefaultTextAnalyzer.new()
  end

  # Nothing to clean up.
  def teardown()
  end

  # The analyzer should declare exactly one field, 'text', of the text type.
  def test_types()
    field_types = @a.get_field_types()
    assert_equal( 1, field_types.size(), "Only one type" )
    assert_equal( RSI::FIELD_TYPE_TEXT, field_types['text'],
                  "Text type dictionary" )
  end

  # Checks term counts and membership for the two sample phrases.
  def test_text()
    terms_a = text_terms( DOC_A )
    assert_equal( 5, terms_a.size(), "Phrase contains 5 interesting terms" )
    assert( terms_a.include?( "DONT" ), "DONT should be in termlist" )

    terms_b = text_terms( DOC_B )
    assert_equal( 3, terms_b.size(), "Phrase contains 3 non-stopwords" )
    has_the = terms_b.include?( "THE" )
    assert( !has_the, "The is a stopword" )
  end

  # Checks the term count for a phrase mixing singular/plural forms with
  # stopwords, and that "THE" is not among the resulting terms.
  def test_stopwords()
    terms = text_terms( "dog dogs buggy buggies child children The an a stop" )
    assert_equal( 5, terms.size(), "porter.rb doesn't stem 'children' right" )
    has_the = terms.include?( "THE" )
    assert( !has_the, "The is a stopword" )
  end

  private

  # Tokenizes +source+ with the analyzer under test and returns the term
  # list stored under the 'text' key of the result.
  def text_terms( source )
    @a.tokenize( source )['text']
  end

end
@@ -0,0 +1,76 @@
1
+ # -*- ruby -*-
2
+ #
3
+ # Tests for rsi/dictionary.rb
4
+ #
5
+
6
+ require 'test/unit'
7
+ require 'fileutils'
8
+ require 'tmpdir'
9
+ require 'rsi/dictionary'
10
+ require 'rsi/serializers'
11
+ require 'rsi/logmanager'
12
+
13
# Tests for RSI::Dictionary: creating a dictionary on disk, storing one
# term entry, and reading it back from a second Dictionary instance opened
# on the same root.
class DictTest < Test::Unit::TestCase

  # Chooses a per-test path under the system temp dir. The directory is
  # NOT created here; create_dictionary asserts that it does not exist yet.
  def setup()
    @failed = false
    @tmp = Dir::tmpdir()
    @root = File.join( @tmp, "d_#{$$}-#{rand(65536)}" )
  end

  # Remembers that the current test failed so teardown can leave the
  # dictionary files in place for inspection.
  #
  # Takes any argument list and forwards it unchanged to the framework
  # implementation, so the override does not depend on a fixed arity.
  # NOTE(review): newer test-unit releases appear to pass an extra options
  # argument here -- confirm against the installed version.
  def add_failure( *args )
    super
    @failed = true
  end

  # Delete the temp dir for the dictionary tests, unless the test failed.
  def teardown()
    if @failed
      print "Test case failed, not cleaning up #@root\n"
    else
      FileUtils::rm_rf( @root )
    end
  end

  # A dictionary can be created, populated and stored.
  def test_create()
    create_dictionary()
  end

  # A stored dictionary can be opened again from the same root and still
  # returns the entry written by create_dictionary.
  def test_reopen()
    create_dictionary()
    # File.exist? replaces FileTest.exists?, which was deprecated and later
    # removed from Ruby.
    assert( File.exist?( @root ), "Test dict root should exist" )
    d = open_dictionary()
    assert_strawberry_entry( d, "Should get same termid both times" )
  end

  # Creates a dictionary at @root, adds one entry for the term
  # "STRAWBERRY" in document "poo.txt" at position 0, stores it, and
  # verifies the entry can be read back from the same instance.
  def create_dictionary()
    assert( !File.exist?( @root ), "Test dict root should not exist" )
    d = open_dictionary()
    assert( File.directory?( @root ), "Root should exist" )

    id = d.get_termid_for( "STRAWBERRY", true )
    d.add_term_entries( "poo.txt", id, [0] )
    d.store()

    assert_strawberry_entry( d, "Should get one termid" )
  end

  private

  # Returns an opened RSI::Dictionary rooted at @root, using the YAML
  # serializer.
  def open_dictionary()
    d = RSI::Dictionary.new( @root )
    d.serializer = RSI::YAMLSerializer.new()
    d.open()
    d
  end

  # Asserts that +dict+ maps "STRAWBERRY" to term id 1 and holds exactly
  # one entry for it: docid "poo.txt", frequency 1, one recorded position.
  # +termid_msg+ is the failure message used for the term id check.
  def assert_strawberry_entry( dict, termid_msg )
    termid = dict.get_termid_for( "STRAWBERRY" )
    assert_equal( 1, termid, termid_msg )
    el = dict.get_entry_list( termid )
    assert_equal( 1, el.length(), "Should be one entry" )
    assert_equal( "poo.txt", el[0].docid )
    assert_equal( 1, el[0].freq )
    assert_equal( 1, el[0].pos_list.length() )
  end

end
@@ -0,0 +1,78 @@
1
+ #!/usr/local/bin/ruby -w
2
+ require 'fileutils'
3
+ require 'test/unit'
4
+ require 'tmpdir'
5
+ require 'rsi'
6
+ require 'rsi/logmanager'
7
+
8
# Sample documents added to the index by the tests in this file.
DOC_A = "The quick brown fox jumped over the lazy dogs"
DOC_B = "Every dog has his day"
DOC_C = "Bjork, bjork, were you brought by the stork, or were you" \
        " created from bubbles and cork?"
12
+
13
# Tests for RSI::Indexer: build an index of three sample documents in a
# temp directory and check which documents a few queries return.
class IdxTest < Test::Unit::TestCase

  # Create a temp dir for the indexes in the test
  def setup()
    @tmp = Dir::tmpdir()
    @root = File.join( @tmp, "searchtest.#{$$}.#{rand(65535)}" )
    Dir.mkdir( @root ) # raises if root already exists, which is intended
    @failed = false
  end

  # Remembers that the current test failed so teardown can leave the index
  # files in place for inspection.
  #
  # Takes any argument list and forwards it unchanged to the framework
  # implementation, so the override does not depend on a fixed arity.
  # NOTE(review): newer test-unit releases appear to pass an extra options
  # argument here -- confirm against the installed version.
  def add_failure( *args )
    super
    @failed = true
  end

  # Delete the temp dir for the index tests, unless the test failed.
  def teardown()
    if @failed
      print "Test case failed, not cleaning up #@root\n"
    else
      FileUtils::rm_rf( @root )
    end
  end

  # Create an index, assert that it contains the right docs and terms
  def test_basic()
    indexer = open_indexer()
    indexer.add_document( "DOC_A", DOC_A )
    indexer.add_document( "DOC_B", DOC_B )
    indexer.add_document( "DOC_C", DOC_C )
    indexer.flush()
    assert_finds( indexer )
  end

  # Creates an index, then asserts that it can be re-opened.
  # The __SKIP__ prefix keeps the name from starting with "test", so the
  # framework does not pick it up as a test.
  def __SKIP__test_reopen()
    test_basic() # build index with 3 docs
    assert_finds( open_indexer() )
  end

  # Assert that the given index contains the right documents/terms
  def assert_finds( indexer )
    a = indexer.find_all( "Bjork" )
    assert_equal( 1, a.length(), "Should have one hit (DOC_C)" )
    assert_equal( "DOC_C", a[0], "Should find DOC_C" )

    # Same term in lower case must produce the same single hit.
    a = indexer.find_all( "bjork" )
    assert_equal( 1, a.length(), "(+i) Should have one hit (DOC_C)" )
    assert_equal( "DOC_C", a[0], "(+i) Should find DOC_C" )

    a = indexer.find_all( "dog" )
    assert_equal( 2, a.length(), "Should have two hits (DOC_A, DOC_B)" )
  end

  private

  # Returns an opened RSI::Indexer rooted at @root, using the YAML
  # serializer.
  def open_indexer()
    indexer = RSI::Indexer.new( @root )
    indexer.serializer = RSI::YAMLSerializer.new()
    indexer.open()
    indexer
  end

end
73
+
74
# When this file is executed directly (rather than required), set the
# LogManager's root to "." and its log file name to "t_index.log".
if __FILE__ == $0
  log_manager = RSI::LogManager.instance
  log_manager.root = "."
  log_manager.log_filename = "t_index.log"
end
78
+
@@ -0,0 +1,71 @@
1
+ #!/usr/local/bin/ruby -w
2
+ require 'fileutils'
3
+ require 'tmpdir'
4
+ require 'test/unit'
5
+ require 'rsi'
6
+
7
module RSI

  # Sample analyzer that produces two text fields from one document:
  # 'subject' (the first line of the content) and 'text' (the whole
  # content). Both are tokenized by the wrapped base analyzer's
  # tokenize_text method.
  class TestX2SampleAnalyzer
    # The wrapped analyzer; it must respond to tokenize_text.
    attr_accessor :base

    # base:: analyzer used for the actual tokenization; defaults to a new
    #        RSI::DefaultTextAnalyzer.
    def initialize( base=RSI::DefaultTextAnalyzer.new() )
      @base = base
    end

    # Returns a hash mapping each field name this analyzer emits to its
    # field type; both fields are RSI::FIELD_TYPE_TEXT.
    def get_field_types()
      return { "text" => RSI::FIELD_TYPE_TEXT,
               "subject" => RSI::FIELD_TYPE_TEXT }
    end

    # Splits +content+ into the 'subject' and 'text' fields and tokenizes
    # each one. Returns a hash keyed by field name.
    #
    # The subject is everything before the first newline. If the content
    # contains no newline at all, the whole content is used as the subject
    # instead of passing nil to the base analyzer.
    def tokenize( content )
      subject = content[ /^(.*?)\n/, 1 ] || content
      r = {}
      r['subject'] = @base.tokenize_text( subject )
      r['text'] = @base.tokenize_text( content )
      return r
    end

  end
end
30
+
31
# Smoke test for indexing with a custom, multi-field analyzer
# (RSI::TestX2SampleAnalyzer).
class X2SampleAnalyzerTest < Test::Unit::TestCase

  DOC_A = "The subject was a long and pointless one\n" +
          "\n" +
          "Also, so was the story.\n\n"

  # Create a temp dir for the index in the test
  def setup()
    @tmp = Dir::tmpdir()
    @root = File.join( @tmp, "searchtest.#{$$}.#{rand(65535)}" )
    Dir.mkdir( @root )
    @failed = false
  end

  # Remembers that the current test failed so teardown can leave the index
  # files in place for inspection.
  #
  # Takes any argument list and forwards it unchanged to the framework
  # implementation, so the override does not depend on a fixed arity.
  # NOTE(review): newer test-unit releases appear to pass an extra options
  # argument here -- confirm against the installed version.
  def add_failure( *args )
    super
    @failed = true
  end

  # Delete the temp dir for the index tests, unless the test failed.
  def teardown()
    if @failed
      print "Test case failed, not cleaning up #@root\n"
    else
      FileUtils::rm_rf( @root )
    end
  end

  # Indexes one document through the two-field analyzer and runs a query.
  # There are no assertions on the result yet; the test only checks that
  # indexing and querying complete without raising.
  def test_sample()
    indexer = RSI::Indexer.new( @root )
    indexer.serializer = RSI::YAMLSerializer.new()
    indexer.analyzer = RSI::TestX2SampleAnalyzer.new()
    indexer.open()
    indexer.add_document( "DOC_A", DOC_A )
    indexer.flush()

    # TODO: queryanalyzers
    indexer.find_all( "pointless subject" )
  end

end
71
+
@@ -0,0 +1 @@
1
+ 0.4
metadata ADDED
@@ -0,0 +1,72 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.8.4
3
+ specification_version: 1
4
+ name: rsi
5
+ version: !ruby/object:Gem::Version
6
+ version: "0.4"
7
+ date: 2005-02-04
8
+ summary: RSI (Ruby Simple Indexer) is a simple full text index
9
+ require_paths:
10
+ - lib
11
+ email: gdf@rubyforge.org
12
+ homepage: http://rsi.rubyforge.org
13
+ rubyforge_project: rsi
14
+ description: "RSI is a simple full text search engine implementation in Ruby. It aims to be
15
+ easily useful within other programs: simple to set up, simple to use. An
16
+ emphasis has been placed on getting functionality out the door, rather than
17
+ heavy optimization (that can come later). It still appears to be reasonably
18
+ fast and efficient (while admitting to have not been heavily profiled...)."
19
+ autorequire: rsi
20
+ default_executable: rsi_search.rb
21
+ bindir: bin
22
+ has_rdoc: true
23
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
24
+ requirements:
25
+ -
26
+ - ">"
27
+ - !ruby/object:Gem::Version
28
+ version: 0.0.0
29
+ version:
30
+ platform: ruby
31
+ authors:
32
+ - Greg Fast
33
+ files:
34
+ - LICENSE
35
+ - Makefile
36
+ - Manifest
37
+ - README
38
+ - TODO
39
+ - rsi.gemspec
40
+ - setup.rb
41
+ - version.release
42
+ - bin/rsi_search.rb
43
+ - bin/search_bench.rb
44
+ - docs/ATTRIB
45
+ - docs/Changes
46
+ - docs/Roadmap
47
+ - lib/rsi.rb
48
+ - lib/rsi/analysis.rb
49
+ - lib/rsi/compressed_serializers.rb
50
+ - lib/rsi/dictionary.rb
51
+ - lib/rsi/index.rb
52
+ - lib/rsi/logmanager.rb
53
+ - lib/rsi/porter.rb
54
+ - lib/rsi/query.rb
55
+ - lib/rsi/rsi_intro.rb
56
+ - lib/rsi/serializers.rb
57
+ - lib/rsi/stoplist.rb
58
+ - lib/rsi/stoplist.txt
59
+ - tests/suite_all.rb
60
+ - tests/t_analysis.rb
61
+ - tests/t_index.rb
62
+ - tests/t_index_multi.rb
63
+ - tests/t_dictionary.rb
64
+ test_files:
65
+ - tests/suite_all.rb
66
+ rdoc_options: []
67
+ extra_rdoc_files: []
68
+ executables:
69
+ - rsi_search.rb
70
+ extensions: []
71
+ requirements: []
72
+ dependencies: []