rsi 0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,14 @@
1
+ #
2
+ # Test suite for running all RSI tests
3
+ #
4
+ require 'test/unit'
5
+ require 'tests/t_index'
6
+ require 'tests/t_index_multi'
7
+ require 'tests/t_dictionary'
8
+ require 'tests/t_analysis'
9
+
10
# When this suite is the program being run (rather than being required by
# something else), send RSI's root log filehandle to the null device.
if $0 == __FILE__
  require 'rsi/logmanager'
  # File::NULL names the platform's null device, so the suite does not
  # depend on a Unix-only "/dev/null" path.
  fh = File.open( File::NULL, "a" )
  RSI::LogManager.instance().root_fh = fh
end
@@ -0,0 +1,43 @@
1
+ #!/usr/local/bin/ruby -w
2
+ require 'fileutils'
3
+ require 'test/unit'
4
+ require 'tmpdir'
5
+ require 'rsi/analysis'
6
+ require 'rsi/index' # for FIELD_TYPE_TEXT
7
+
8
# Tests for RSI::DefaultTextAnalyzer: the field types it declares and the
# terms it produces when tokenizing sample text.
class AnalTest < Test::Unit::TestCase

  DOC_A = "Weebles wobble but they don't fall down"
  DOC_B = "The boot is a whale, then?"

  # Build a fresh analyzer for each test.
  def setup
    @a = RSI::DefaultTextAnalyzer.new
  end

  # Nothing to clean up.
  def teardown
  end

  # The analyzer should declare exactly one field, 'text', of text type.
  def test_types
    field_types = @a.get_field_types
    assert_equal(1, field_types.size, "Only one type")
    assert_equal(RSI::FIELD_TYPE_TEXT, field_types['text'], "Text type dictionary")
  end

  # Tokenizing the sample documents should yield the expected term counts.
  def test_text
    tokens = @a.tokenize(DOC_A)
    assert_equal(5, tokens['text'].size, "Phrase contains 5 interesting terms")
    assert(tokens['text'].include?("DONT"), "DONT should be in termlist")

    tokens = @a.tokenize(DOC_B)
    assert_equal(3, tokens['text'].size, "Phrase contains 3 non-stopwords")
    assert(!tokens['text'].include?("THE"), "The is a stopword")
  end

  # Stopwords must not appear in the term list; the expected count of 5 is
  # tied to how the stemmer handles these words (see the assertion message).
  def test_stopwords
    tokens = @a.tokenize("dog dogs buggy buggies child children The an a stop")
    assert_equal(5, tokens['text'].size, "porter.rb doesn't stem 'children' right")
    assert(!tokens['text'].include?("THE"), "The is a stopword")
  end

end
@@ -0,0 +1,76 @@
1
+ # -*- ruby -*-
2
+ #
3
+ # Tests for rsi/dictionary.rb
4
+ #
5
+
6
+ require 'test/unit'
7
+ require 'fileutils'
8
+ require 'tmpdir'
9
+ require 'rsi/dictionary'
10
+ require 'rsi/serializers'
11
+ require 'rsi/logmanager'
12
+
13
# Tests for RSI::Dictionary: create a dictionary on disk, store a single
# term entry in it, and read that entry back (including after re-opening).
class DictTest < Test::Unit::TestCase

  # Choose a per-test path under the system temp dir. The directory itself
  # is NOT created here; create_dictionary asserts that it does not exist yet.
  def setup()
    @failed = false
    @tmp = Dir::tmpdir()
    fn = "d_#{$$}-#{rand(65536)}"
    @root = File.join( @tmp, fn )
  end

  # Remember that an assertion failed so teardown can leave the files in
  # place for inspection. Extra arguments are accepted and forwarded by the
  # bare `super`, because newer test-unit releases appear to call
  # add_failure with an additional options hash
  # (NOTE(review): confirm against the installed test-unit version).
  def add_failure( msg, bt, *rest )
    super
    @failed = true
  end

  # Delete the temp dir for the dictionary tests, unless a failure was
  # recorded, in which case it is kept and its path is printed.
  def teardown()
    if @failed
      print "Test case failed, not cleaning up #@root\n";
    else
      FileUtils::rm_rf( @root )
    end
  end

  # Creating a dictionary and reading back its single entry should work.
  def test_create()
    create_dictionary()
  end

  # A dictionary written by create_dictionary should be readable through a
  # second RSI::Dictionary object opened on the same root.
  def test_reopen()
    create_dictionary()
    # File.exist? replaces FileTest.exists?, which is gone in Ruby 3.2+.
    assert( File.exist?( @root ), "Test dict root should exist" )
    d = RSI::Dictionary.new( @root )
    d.serializer = RSI::YAMLSerializer.new()
    d.open()

    id2 = d.get_termid_for( "STRAWBERRY" )
    assert_equal( 1, id2, "Should get same termid both times")
    el = d.get_entry_list( id2 )
    assert_equal( 1, el.length(), "Should be one entry" )
    assert_equal( "poo.txt", el[0].docid )
    assert_equal( 1, el[0].freq )
    assert_equal( 1, el[0].pos_list.length() )
  end

  # Helper (not a test itself): builds a dictionary at @root containing the
  # term "STRAWBERRY" with one entry for "poo.txt" at position 0, stores it,
  # and asserts the entry can be read back from the same object.
  def create_dictionary()
    assert( !File.exist?( @root ), "Test dict root should not exist" )
    d = RSI::Dictionary.new( @root )
    d.serializer = RSI::YAMLSerializer.new()
    d.open()
    assert( FileTest.directory?( @root ), "Root should exist" )

    id = d.get_termid_for( "STRAWBERRY", true )
    d.add_term_entries( "poo.txt", id, [0] )
    d.store()

    id2 = d.get_termid_for( "STRAWBERRY" )
    assert_equal( 1, id2, "Should get one termid")
    el = d.get_entry_list( id2 )
    assert_equal( 1, el.length(), "Should be one entry" )
    assert_equal( "poo.txt", el[0].docid )
    assert_equal( 1, el[0].freq )
    assert_equal( 1, el[0].pos_list.length() )
  end

end
@@ -0,0 +1,78 @@
1
+ #!/usr/local/bin/ruby -w
2
+ require 'fileutils'
3
+ require 'test/unit'
4
+ require 'tmpdir'
5
+ require 'rsi'
6
+ require 'rsi/logmanager'
7
+
8
# Sample documents used as index content by the tests in this file.
DOC_A = "The quick brown fox jumped over the lazy dogs"
DOC_B = "Every dog has his day"
DOC_C = "Bjork, bjork, were you brought by the stork, or were you" \
        " created from bubbles and cork?"
12
+
13
# Tests for RSI::Indexer: build an index over three small documents in a
# temp directory and check that single-term searches find the right ones.
class IdxTest < Test::Unit::TestCase

  # Create a temp dir for the indexes in the test
  def setup()
    @tmp = Dir::tmpdir()
    @root = File.join( @tmp, "searchtest.#{$$}.#{rand(65535)}" )
    Dir.mkdir( @root ) # will raise if root exists, which is what I want
    @failed = false
  end

  # Remember that an assertion failed so teardown can leave the index files
  # in place. Extra arguments are accepted and forwarded by the bare `super`,
  # because newer test-unit releases appear to call add_failure with an
  # additional options hash
  # (NOTE(review): confirm against the installed test-unit version).
  def add_failure( msg, bt, *rest )
    super
    @failed = true
  end

  # Delete the temp dir for the index tests, unless a failure was recorded,
  # in which case it is kept and its path is printed.
  def teardown()
    if @failed
      print "Test case failed, not cleaning up #@root\n";
    else
      FileUtils::rm_rf( @root )
    end
  end

  # Create an index, assert that it contains the right docs and terms
  def test_basic()
    indexer = RSI::Indexer.new( @root )
    indexer.serializer = RSI::YAMLSerializer.new()
    indexer.open()
    indexer.add_document( "DOC_A", DOC_A )
    indexer.add_document( "DOC_B", DOC_B )
    indexer.add_document( "DOC_C", DOC_C )
    indexer.flush()
    assert_finds( indexer )
  end

  # Creates an index, then asserts that it can be re-opened.
  # The __SKIP__ prefix means the name does not start with "test",
  # presumably so the test runner does not collect it.
  def __SKIP__test_reopen()
    test_basic() # build index with 3 docs
    indexer = RSI::Indexer.new( @root )
    indexer.serializer = RSI::YAMLSerializer.new()
    indexer.open()
    assert_finds( indexer )
  end

  # Assert that the given index contains the right documents/terms:
  # "Bjork" in either letter case finds only DOC_C, and "dog" has two hits.
  def assert_finds( indexer )
    a = indexer.find_all( "Bjork" )
    assert_equal( 1, a.length(), "Should have one hit (DOC_C)" )
    assert_equal( "DOC_C", a[0], "Should find DOC_C" )

    a = indexer.find_all( "bjork" )
    assert_equal( 1, a.length(), "(+i) Should have one hit (DOC_C)" )
    assert_equal( "DOC_C", a[0], "(+i) Should find DOC_C" )

    a = indexer.find_all( "dog" )
    assert_equal( 2, a.length(), "Should have two hits (DOC_A, DOC_B)" )
  end

end
73
+
74
# When this file is the program being run, point the RSI log manager at
# the current directory and name the log file t_index.log.
if __FILE__ == $0
  RSI::LogManager.instance().root = "."
  RSI::LogManager.instance().log_filename = "t_index.log"
end
78
+
@@ -0,0 +1,71 @@
1
+ #!/usr/local/bin/ruby -w
2
+ require 'fileutils'
3
+ require 'tmpdir'
4
+ require 'test/unit'
5
+ require 'rsi'
6
+
7
module RSI

  # Sample analyzer producing two fields from one document: 'subject'
  # (the first line of the content) and 'text' (the whole content).
  # Tokenizing itself is delegated to a base analyzer's tokenize_text.
  class TestX2SampleAnalyzer
    # The analyzer whose tokenize_text method does the actual tokenizing.
    attr_accessor :base

    # base:: analyzer to delegate to; a new RSI::DefaultTextAnalyzer is
    #        built when none is given.
    def initialize( base=RSI::DefaultTextAnalyzer.new() )
      @base = base
    end

    # Returns a hash mapping each field name this analyzer emits
    # ("text" and "subject") to RSI::FIELD_TYPE_TEXT.
    def get_field_types()
      return { "text" => RSI::FIELD_TYPE_TEXT,
               "subject" => RSI::FIELD_TYPE_TEXT }
    end

    # Splits content into its two fields and tokenizes each one.
    #
    # content:: document text; everything before the first newline is the
    #           subject. When there is no newline at all, the whole content
    #           is the subject (rather than nil being handed to the base
    #           analyzer, which is what a failed regexp match would give).
    #
    # Returns a hash with 'subject' and 'text' keys, each holding whatever
    # the base analyzer's tokenize_text returned for that field.
    def tokenize( content )
      # \A anchors at the start of the string; [^\n]* always matches, even
      # on an empty first line, so subject_line is never nil.
      subject_line = content[ /\A[^\n]*/ ]
      r = {}
      r['subject'] = @base.tokenize_text( subject_line )
      r['text'] = @base.tokenize_text( content )
      return r
    end

  end
end
30
+
31
# Smoke test for indexing with a custom, multi-field analyzer
# (RSI::TestX2SampleAnalyzer) plugged into RSI::Indexer.
class X2SampleAnalyzerTest < Test::Unit::TestCase

  # One-line subject, a blank line, then a short body.
  DOC_A = "The subject was a long and pointless one\n" +
          "\n" +
          "Also, so was the story.\n\n"

  # Create a fresh temp dir to hold the index for this test.
  def setup()
    @tmp = Dir::tmpdir()
    @root = File.join( @tmp, "searchtest.#{$$}.#{rand(65535)}" )
    Dir.mkdir( @root )
    @failed = false
  end

  # Remember that an assertion failed so teardown can leave the index files
  # in place. Extra arguments are accepted and forwarded by the bare `super`,
  # because newer test-unit releases appear to call add_failure with an
  # additional options hash
  # (NOTE(review): confirm against the installed test-unit version).
  def add_failure( msg, bt, *rest )
    super
    @failed = true
  end

  # Delete the temp dir for the index tests, unless a failure was recorded,
  # in which case it is kept and its path is printed.
  def teardown()
    if @failed
      print "Test case failed, not cleaning up #@root\n";
    else
      FileUtils::rm_rf( @root )
    end
  end

  # Index one document through the sample analyzer and run a query.
  # Only checks that nothing raises; the query result is not inspected.
  def test_sample()
    indexer = RSI::Indexer.new( @root )
    indexer.serializer = RSI::YAMLSerializer.new()
    indexer.analyzer = RSI::TestX2SampleAnalyzer.new()
    indexer.open()
    indexer.add_document( "DOC_A", DOC_A )
    indexer.flush()

    # TODO: queryanalyzers
    # Result deliberately not assigned: there is nothing to assert on yet,
    # and an unused local triggers a warning under `ruby -w`.
    indexer.find_all( "pointless subject" )
  end

end
71
+
@@ -0,0 +1 @@
1
+ 0.4
metadata ADDED
@@ -0,0 +1,72 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.8.4
3
+ specification_version: 1
4
+ name: rsi
5
+ version: !ruby/object:Gem::Version
6
+ version: "0.4"
7
+ date: 2005-02-04
8
+ summary: RSI (Ruby Simple Indexer) is a simple full text index
9
+ require_paths:
10
+ - lib
11
+ email: gdf@rubyforge.org
12
+ homepage: http://rsi.rubyforge.org
13
+ rubyforge_project: rsi
14
+ description: "RSI is a simple full text search engine implementation in Ruby. It aims to be
15
+ easily useful within other programs: simple to set up, simple to use. An
16
+ emphasis has been placed on getting functionality out the door, rather than
17
+ heavy optimization (that can come later). It still appears to be reasonably
18
+ fast and efficient (while admitting to have not been heavily profiled...)."
19
+ autorequire: rsi
20
+ default_executable: rsi_search.rb
21
+ bindir: bin
22
+ has_rdoc: true
23
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
24
+ requirements:
25
+ -
26
+ - ">"
27
+ - !ruby/object:Gem::Version
28
+ version: 0.0.0
29
+ version:
30
+ platform: ruby
31
+ authors:
32
+ - Greg Fast
33
+ files:
34
+ - LICENSE
35
+ - Makefile
36
+ - Manifest
37
+ - README
38
+ - TODO
39
+ - rsi.gemspec
40
+ - setup.rb
41
+ - version.release
42
+ - bin/rsi_search.rb
43
+ - bin/search_bench.rb
44
+ - docs/ATTRIB
45
+ - docs/Changes
46
+ - docs/Roadmap
47
+ - lib/rsi.rb
48
+ - lib/rsi/analysis.rb
49
+ - lib/rsi/compressed_serializers.rb
50
+ - lib/rsi/dictionary.rb
51
+ - lib/rsi/index.rb
52
+ - lib/rsi/logmanager.rb
53
+ - lib/rsi/porter.rb
54
+ - lib/rsi/query.rb
55
+ - lib/rsi/rsi_intro.rb
56
+ - lib/rsi/serializers.rb
57
+ - lib/rsi/stoplist.rb
58
+ - lib/rsi/stoplist.txt
59
+ - tests/suite_all.rb
60
+ - tests/t_analysis.rb
61
+ - tests/t_index.rb
62
+ - tests/t_index_multi.rb
63
+ - tests/t_dictionary.rb
64
+ test_files:
65
+ - tests/suite_all.rb
66
+ rdoc_options: []
67
+ extra_rdoc_files: []
68
+ executables:
69
+ - rsi_search.rb
70
+ extensions: []
71
+ requirements: []
72
+ dependencies: []