rsi 0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +25 -0
- data/Makefile +24 -0
- data/Manifest +30 -0
- data/README +49 -0
- data/TODO +30 -0
- data/bin/rsi_search.rb +50 -0
- data/bin/search_bench.rb +47 -0
- data/docs/ATTRIB +14 -0
- data/docs/Changes +25 -0
- data/docs/Roadmap +41 -0
- data/lib/rsi.rb +40 -0
- data/lib/rsi/analysis.rb +79 -0
- data/lib/rsi/compressed_serializers.rb +60 -0
- data/lib/rsi/dictionary.rb +232 -0
- data/lib/rsi/index.rb +245 -0
- data/lib/rsi/logmanager.rb +105 -0
- data/lib/rsi/porter.rb +213 -0
- data/lib/rsi/query.rb +98 -0
- data/lib/rsi/rsi_intro.rb +91 -0
- data/lib/rsi/serializers.rb +31 -0
- data/lib/rsi/stoplist.rb +72 -0
- data/lib/rsi/stoplist.txt +59 -0
- data/rsi.gemspec +59 -0
- data/setup.rb +1360 -0
- data/tests/suite_all.rb +14 -0
- data/tests/t_analysis.rb +43 -0
- data/tests/t_dictionary.rb +76 -0
- data/tests/t_index.rb +78 -0
- data/tests/t_index_multi.rb +71 -0
- data/version.release +1 -0
- metadata +72 -0
@@ -0,0 +1,105 @@
|
|
1
|
+
|
2
|
+
require 'tmpdir'
|
3
|
+
require 'logger'
|
4
|
+
require 'singleton'
|
5
|
+
|
6
|
+
module RSI

  # Mixin that equips the including class with a #logger() method whose
  # result is supplied by the RSI::LogManager singleton.
  # #logger() returns a Logger object.
  #
  #   class StuffThing
  #     include RSI::Loggable
  #     def do_dealie()
  #       logger.info( "Doing some dealie" )
  #     end
  #   end
  #
  # The logger handed back by #logger() may be reconfigured by its user:
  #
  #   def initialize()
  #     logger.level = Logger::DEBUG   # log all messages
  #     logger.debug( "This will show up in the log, now" )
  #   end
  #
  module Loggable
    # Asks the LogManager singleton for the logger matching the receiver.
    def logger
      manager = RSI::LogManager.instance
      manager.logger_for( self )
    end
  end

  # Logger with an added #write() method, which lets an instance stand
  # in as the log device argument of Logger#new():
  #
  #   root  = XLogger.new( "foo.log" )
  #   other = Logger.new( root )
  #
  class XLogger < Logger
    # Passes +msg+ unchanged to this logger's underlying log device.
    def write( msg )
      @logdev.write( msg )
    end
  end

  # Creates and caches the loggers used by classes mixing in RSI::Loggable.
  #
  # Defaults: the log directory is Dir::tmpdir and the log file name is
  # "app.log". To override them, call #root=() and/or #log_filename=()
  # before the manager is first used (ie, before the first call to
  # RSI::Loggable#logger()). An arbitrary IO may be supplied through
  # #root_fh=() instead.
  #
  # Every logger handed out starts at level Logger::INFO; a class mixing
  # in RSI::Loggable can change that through #logger.level=() .
  #
  class LogManager
    include Singleton

    attr_accessor :root, :log_filename, :root_fh
    attr_reader :root_logger

    def initialize()
      @root_logger = nil
      @logger_cache = {}
      @root_fh = nil
      @log_filename = "app.log"
      @root = Dir::tmpdir
    end

    # Returns the logger registered under the name derived from +obj+,
    # creating it on first request.
    # +obj+ may be a String (used as the name), a Class/Module (its name
    # is used), or any other object (the name of its class is used).
    public
    def logger_for( obj="root" )
      key = cache_key_for( obj )
      return @logger_cache[key] if @logger_cache.has_key?( key )

      # The shared root logger is only built when the first logger is needed.
      configure() if @root_logger.nil?
      child = Logger.new( @root_logger )
      @logger_cache[key] = child
      child.progname = key
      child.level = Logger::INFO
      return child
    end

    private

    # Maps the argument of #logger_for() to the name used as cache key
    # and as the logger's progname.
    def cache_key_for( obj )
      return obj if obj.kind_of?( String )
      return obj.name if obj.kind_of?( Module )
      return obj.class.name
    end

    # Builds the root logger, opening the default log file first when no
    # IO was supplied through #root_fh=().
    def configure()
      open_default_log_file() if @root_fh.nil?
      @root_logger = XLogger.new( @root_fh )
      @root_logger.progname = "root"
      # (the root logger's level is left alone on purpose: setting
      # @root_logger.level seems to screw things up)
    end

    # Opens <root>/<log_filename> for appending (creating it if needed)
    # and turns on sync mode for the handle.
    def open_default_log_file()
      path = File.join( @root, @log_filename )
      @root_fh = File.open( path, File::WRONLY | File::APPEND | File::CREAT )
      @root_fh.sync = true
    end

  end
end
|
104
|
+
|
105
|
+
|
data/lib/rsi/porter.rb
ADDED
@@ -0,0 +1,213 @@
|
|
1
|
+
#! /local/ruby/bin/ruby
|
2
|
+
#
|
3
|
+
# $Id: porter.rb 37 2005-01-13 04:23:07Z gdf $
|
4
|
+
#
|
5
|
+
# See example usage at the end of this file.
|
6
|
+
#
|
7
|
+
|
8
|
+
module Stemmable

  # Step 2 suffix => replacement table.
  STEP_2_LIST = {
    'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',
    'izer'=>'ize', 'bli'=>'ble',
    'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous',
    'ization'=>'ize', 'ation'=>'ate',
    'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful',
    'ousness'=>'ous', 'aliti'=>'al',
    'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'
  }

  # Step 3 suffix => replacement table.
  STEP_3_LIST = {
    'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
    'ical'=>'ic', 'ful'=>'', 'ness'=>''
  }


  # Suffixes examined in step 2 (the keys of STEP_2_LIST).
  SUFFIX_1_REGEXP = /(
                    ational  |
                    tional   |
                    enci     |
                    anci     |
                    izer     |
                    bli      |
                    alli     |
                    entli    |
                    eli      |
                    ousli    |
                    ization  |
                    ation    |
                    ator     |
                    alism    |
                    iveness  |
                    fulness  |
                    ousness  |
                    aliti    |
                    iviti    |
                    biliti   |
                    logi)$/x


  # Suffixes removed in step 4.
  SUFFIX_2_REGEXP = /(
                      al       |
                      ance     |
                      ence     |
                      er       |
                      ic       |
                      able     |
                      ible     |
                      ant      |
                      ement    |
                      ment     |
                      ent      |
                      ou       |
                      ism      |
                      ate      |
                      iti      |
                      ous      |
                      ive      |
                      ize)$/x


  C = "[^aeiou]"                     # consonant
  V = "[aeiouy]"                     # vowel
  CC = "#{C}(?>[^aeiouy]*)"          # consonant sequence
  VV = "#{V}(?>[aeiou]*)"            # vowel sequence

  MGR0 = /^(#{CC})?#{VV}#{CC}/o                # [cc]vvcc... is m>0
  MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o       # [cc]vvcc[vv] is m=1
  MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o      # [cc]vvccvvcc... is m>1
  VOWEL_IN_STEM   = /^(#{CC})?#{V}/o           # vowel in stem

  #
  # Porter stemmer in Ruby.
  #
  # This is the Porter stemming algorithm, ported to Ruby from the
  # version coded up in Perl.  It's easy to follow against the rules
  # in the original paper in:
  #
  #   Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
  #   no. 3, pp 130-137,
  #
  # See also http://www.tartarus.org/~martin/PorterStemmer
  #
  # Send comments to raypereda@hotmail.com
  #
  # Returns the stem of the receiver's string form (#to_str) as a new
  # String; the receiver itself is never modified.  Words shorter than
  # three characters are returned unchanged (as a copy).  All patterns
  # are written with lowercase letters, so lowercase input is expected.
  #

  def stem_porter

    # Work on a private copy of the receiver's string form.  The copy is
    # taken AFTER #to_str on purpose: #dup is shallow, so for a non-String
    # receiver "self.dup.to_str" could hand back the very String the
    # receiver holds, and the in-place edits below (w[0]=) would then
    # corrupt it, or raise if that String is frozen.
    w = to_str.dup

    return w if w.length < 3

    # now map initial y to Y so that the patterns never treat it as vowel
    w[0] = 'Y' if w[0] == ?y

    # Step 1a
    if w =~ /(ss|i)es$/
      w = $` + $1
    elsif w =~ /([^s])s$/
      w = $` + $1
    end

    # Step 1b
    if w =~ /eed$/
      w.chop! if $` =~ MGR0
    elsif w =~ /(ed|ing)$/
      stem = $`
      if stem =~ VOWEL_IN_STEM
        w = stem
        case w
        when /(at|bl|iz)$/             then w << "e"
        when /([^aeiouylsz])\1$/       then w.chop!
        when /^#{CC}#{V}[^aeiouwxy]$/o then w << "e"
        end
      end
    end

    # Step 1c: terminal y becomes i when the stem contains a vowel
    if w =~ /y$/
      stem = $`
      w = stem + "i" if stem =~ VOWEL_IN_STEM
    end

    # Step 2
    if w =~ SUFFIX_1_REGEXP
      stem = $`
      suffix = $1
      # print "stem= " + stem + "\n" + "suffix=" + suffix + "\n"
      if stem =~ MGR0
        w = stem + STEP_2_LIST[suffix]
      end
    end

    # Step 3
    if w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/
      stem = $`
      suffix = $1
      if stem =~ MGR0
        w = stem + STEP_3_LIST[suffix]
      end
    end

    # Step 4
    if w =~ SUFFIX_2_REGEXP
      stem = $`
      if stem =~ MGR1
        w = stem
      end
    elsif w =~ /(s|t)(ion)$/
      stem = $` + $1
      if stem =~ MGR1
        w = stem
      end
    end

    #  Step 5
    if w =~ /e$/
      stem = $`
      if (stem =~ MGR1) ||
          (stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)
        w = stem
      end
    end

    if w =~ /ll$/ && w =~ MGR1
      w.chop!
    end

    # and turn initial Y back to y
    w[0] = 'y' if w[0] == ?Y

    w
  end


  #
  # make the stem_porter the default stem method, just in case we
  # feel like having multiple stemmers available later.
  #
  alias stem stem_porter

end
|
192
|
+
|
193
|
+
|
194
|
+
|
195
|
+
#
|
196
|
+
# Make this script executable, and send it words on stdin, one per
|
197
|
+
# line, and it will output the stemmed versions to stdout.
|
198
|
+
#
|
199
|
+
if $0 == __FILE__ then
  class String
    include Stemmable
  end

  # the String class, and any subclasses of it you might have, now know
  # how to stem things.

  $stdin.each do |word|
    # Each line still carries its line terminator.  Strip it before
    # stemming: otherwise the length check counts it as a letter and the
    # stemmer's chop! removes the newline instead of the final character
    # (e.g. "agreed\n" would come out as "agreed").
    puts word.chomp.stem
  end
end
|
211
|
+
|
212
|
+
|
213
|
+
|
data/lib/rsi/query.rb
ADDED
@@ -0,0 +1,98 @@
|
|
1
|
+
|
2
|
+
require 'rsi/logmanager'
|
3
|
+
|
4
|
+
module RSI
|
5
|
+
|
6
|
+
# Base class for queries: holds an ordered list of subqueries.
# Subclasses supply the actual #evaluate() behaviour.
class Query
  include Loggable

  def initialize
    @subqueries = []
  end

  # Appends +query+ to the list of subqueries.
  def add_subquery( query )
    @subqueries.push( query )
  end

  # Does nothing (returns nil) in the base class.
  def evaluate( locator )
  end

end
|
20
|
+
|
21
|
+
# Query whose result is the intersection of its subqueries' results.
class ANDQuery < Query

  # Evaluates every subquery against +locator+ and intersects the
  # results with &, in subquery order.
  # Stops as soon as the running result has size 0.
  # Returns nil when there are no subqueries.
  def evaluate( locator )
    matches = nil
    @subqueries.each do |subquery|
      found = subquery.evaluate( locator )
      matches = matches.nil? ? found : ( matches & found )
      # short-circuit bottoming out
      return matches if matches.size() == 0
    end
    matches
  end

  # Renders the subqueries joined by " AND ", wrapped in parentheses.
  def to_s
    "( #{@subqueries.join(" AND ")} )"
  end
end
|
43
|
+
|
44
|
+
# Query whose result is the union of its subqueries' results.
class ORQuery < Query

  # Evaluates every subquery against +locator+ and merges the results
  # with |, in subquery order.
  # Returns an empty Array when there are no subqueries.
  #
  # The +locator+ parameter was missing from the signature even though
  # the body passes it on to each subquery, so the method could not be
  # called the way Query#evaluate( locator ) and ANDQuery#evaluate( locator )
  # are declared.
  def evaluate( locator )
    ret_set = []
    @subqueries.each do |q|
      ret_set = ret_set | q.evaluate( locator )
    end
    return ret_set
  end

  # Renders the subqueries joined by " OR ", wrapped in parentheses.
  def to_s
    return "( " + @subqueries.join(" OR ") + " )"
  end
end
|
57
|
+
|
58
|
+
# Leaf query matching a single term within a single field.
class TermQuery < Query
  attr_accessor :field, :term

  def initialize( field, term )
    @field, @term = field, term
  end

  # Looks up the dictionary for @field through +locator+ and returns the
  # docids of that dictionary's entries for the term, without duplicates.
  # Returns an empty Array when the dictionary does not have the term.
  def evaluate( locator )
    logger.debug( "Getting dict for #@field" )
    dict = locator.get_dict_for_field( @field )

    # Unknown term: nothing can match.
    unless dict.has_term?( term )
      logger.debug( "Dict has no such term #{term}" )
      return []
    end

    # get all docids containing @field:@term
    termid = dict.get_termid_for( term )
    logger.debug( "Getting entries for #{term}(#{termid})" )
    docids = []
    dict.get_entry_list( termid ).each do |entry|
      logger.debug( entry.to_s )
      docids.push( entry.docid )
    end
    docids.uniq
  end

  # Renders the query as field='term'.
  def to_s
    "#@field='#@term'"
  end
end
|
87
|
+
|
88
|
+
##; def analyze_query( q_str )
|
89
|
+
##; # (a OR b) AND (c OR d)
|
90
|
+
##; # -> AND[ OR[a,b], OR[c,d] ]
|
91
|
+
##; # split on whitespace
|
92
|
+
##; # split x:foo
|
93
|
+
##; # tokenize foo
|
94
|
+
##; # add another AND termquery
|
95
|
+
##;
|
96
|
+
##; end
|
97
|
+
|
98
|
+
end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
#
|
2
|
+
# = RSI (Ruby Simple Indexer)
|
3
|
+
#
|
4
|
+
# RSI is a simple full text search engine implementation in Ruby. It
|
5
|
+
# aims to be easily useful within other programs: simple to set up,
|
6
|
+
# simple to use.
|
7
|
+
#
|
8
|
+
# An emphasis has been placed on getting functionality out the door,
|
9
|
+
# rather than heavy optimization (that can come later). It still
|
10
|
+
# appears to be reasonably fast and efficient (while admitting to have
|
11
|
+
# not been heavily profiled...).
|
12
|
+
#
|
13
|
+
# == Getting RSI
|
14
|
+
#
|
15
|
+
# RSI can be downloaded from Rubyforge (http://rubyforge.org/projects/rsi/).
|
16
|
+
#
|
17
|
+
# == Using RSI
|
18
|
+
#
|
19
|
+
# Creating an index:
|
20
|
+
#
|
21
|
+
# require 'rsi'
|
22
|
+
# indexer = RSI::Index.new( "/path/to/index" )
|
23
|
+
# Dir.foreach( "~/words" ) do |textfile|
|
24
|
+
# indexer.add_document( textfile, File.read("~/words/#{textfile}") )
|
25
|
+
# end
|
26
|
+
# indexer.flush()
|
27
|
+
#
|
28
|
+
# By default, the RSI indexer assumes that documents fed to it are plain
|
29
|
+
# text docs (more complex analyzers should appear in future releases).
|
30
|
+
#
|
31
|
+
# Searching an index:
|
32
|
+
#
|
33
|
+
# require 'rsi'
|
34
|
+
# indexer = RSI::Index.new( "/path/to/index" )
|
35
|
+
# puts indexer.find_all( "some three terms" )
|
36
|
+
#
|
37
|
+
# == Advanced Usage
|
38
|
+
#
|
39
|
+
# (Tweakability will be enhanced in future releases.)
|
40
|
+
#
|
41
|
+
# require 'rsi'
|
42
|
+
#
|
43
|
+
# indexer = RSI::Indexer.new( "/data/search" )
|
44
|
+
# indexer.serializer = RSI::NativeSerializer.new()
|
45
|
+
# indexer.analyzer = RSI::DefaultTextAnalyzer.new()
|
46
|
+
# indexer.query_analyzer = RSI::DefaultTextAnalyzer.new()
|
47
|
+
#
|
48
|
+
# === Changing the dictionary serializer
|
49
|
+
#
|
50
|
+
# The dictionary's serializer controls how the index database is
|
51
|
+
# stored. By default, RSI uses Ruby's Marshal to store the database
|
52
|
+
# objects. These serializers are also available:
|
53
|
+
#
|
54
|
+
# * RSI::NativeSerializer - default, uses Ruby's built-in Marshal lib.
|
55
|
+
#
|
56
|
+
# * RSI::YAMLSerializer - serializes DB objects as YAML. Excellent for
|
57
|
+
# debugging purposes. Very slow compared to NativeSerializer.
|
58
|
+
#
|
59
|
+
# * RSI::CompressedSerializer - uses Marshal (by default), plus
|
60
|
+
# compresses the output with bzip. The speed penalty is probably not
|
61
|
+
# worth the space savings (at least the way the db is currently
|
62
|
+
# implemented). Also requires the `bz2` library.
|
63
|
+
#
|
64
|
+
# Naturally, if you create an index with a given serializer, you will
|
65
|
+
# need to re-open the index with that same serializer. (This should be
|
66
|
+
# auto-detected in future releases.)
|
67
|
+
#
|
68
|
+
# === Changing the analyzer
|
69
|
+
#
|
70
|
+
# The analyzer is used to tokenize documents into indexable
|
71
|
+
# terms. The default analyzer splits on whitespace and performs some
|
72
|
+
# normalization (stemming, stopword removal, etc).
|
73
|
+
#
|
74
|
+
# The query analyzer is used to tokenize query terms.
|
75
|
+
#
|
76
|
+
# Currently there are no other analyzers available (see Roadmap).
|
77
|
+
#
|
78
|
+
# === Changing the stoplist
|
79
|
+
#
|
80
|
+
# The default stoplist is pretty minimal (see stoplist.rb).
|
81
|
+
#
|
82
|
+
# (should be easier: see Development Roadmap)
|
83
|
+
#
|
84
|
+
# class MyAnalyzer < RSI::Analyzer
|
85
|
+
# def initialize_stoplist()
|
86
|
+
# return unless @stoplist.nil?
|
87
|
+
# @stoplist = { 'THE' => 1, ... }
|
88
|
+
# end
|
89
|
+
# end
|
90
|
+
#
|
91
|
+
module RSI
end
|