rsi 0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +25 -0
- data/Makefile +24 -0
- data/Manifest +30 -0
- data/README +49 -0
- data/TODO +30 -0
- data/bin/rsi_search.rb +50 -0
- data/bin/search_bench.rb +47 -0
- data/docs/ATTRIB +14 -0
- data/docs/Changes +25 -0
- data/docs/Roadmap +41 -0
- data/lib/rsi.rb +40 -0
- data/lib/rsi/analysis.rb +79 -0
- data/lib/rsi/compressed_serializers.rb +60 -0
- data/lib/rsi/dictionary.rb +232 -0
- data/lib/rsi/index.rb +245 -0
- data/lib/rsi/logmanager.rb +105 -0
- data/lib/rsi/porter.rb +213 -0
- data/lib/rsi/query.rb +98 -0
- data/lib/rsi/rsi_intro.rb +91 -0
- data/lib/rsi/serializers.rb +31 -0
- data/lib/rsi/stoplist.rb +72 -0
- data/lib/rsi/stoplist.txt +59 -0
- data/rsi.gemspec +59 -0
- data/setup.rb +1360 -0
- data/tests/suite_all.rb +14 -0
- data/tests/t_analysis.rb +43 -0
- data/tests/t_dictionary.rb +76 -0
- data/tests/t_index.rb +78 -0
- data/tests/t_index_multi.rb +71 -0
- data/version.release +1 -0
- metadata +72 -0
@@ -0,0 +1,105 @@
|
|
1
|
+
|
2
|
+
require 'tmpdir'
|
3
|
+
require 'logger'
|
4
|
+
require 'singleton'
|
5
|
+
|
6
|
+
module RSI
|
7
|
+
|
8
|
+
# Mixin providing a RSI::LogManager-managed #logger() method.
|
9
|
+
# #logger() returns a Logger object.
|
10
|
+
#
|
11
|
+
# class StuffThing
|
12
|
+
# include RSI::Loggable
|
13
|
+
# def do_dealie()
|
14
|
+
# logger.info( "Doing some dealie" )
|
15
|
+
# end
|
16
|
+
# end
|
17
|
+
#
|
18
|
+
# The settings of the logger returned by #logger() can be modified:
|
19
|
+
#
|
20
|
+
# def initialize()
|
21
|
+
# logger.level = Logger::DEBUG # log all messages
|
22
|
+
# logger.debug( "This will show up in the log, now" )
|
23
|
+
# end
|
24
|
+
#
|
25
|
+
module Loggable
|
26
|
+
def logger
|
27
|
+
return RSI::LogManager.instance().logger_for( self )
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
# Trivial extension of Logger, providing it a #write() method.
|
32
|
+
# This allows instances of this logger to be used as the
|
33
|
+
# argument to Logger#new().
|
34
|
+
#
|
35
|
+
# root = XLogger.new( "foo.log" )
|
36
|
+
# other = Logger.new( root )
|
37
|
+
#
|
38
|
+
class XLogger < Logger
|
39
|
+
def write( msg )
|
40
|
+
@logdev.write( msg )
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
# Manages logger creation for classes which mixin RSI::Loggable.
|
45
|
+
# LogManager has default settings for the log directory (Dir::tmpdir)
|
46
|
+
# and for the log file name ("app.log").
|
47
|
+
#
|
48
|
+
# If you'd like to override the defaults, call #root=()
|
49
|
+
# and/or #log_filename=() before LogManager is first used (ie,
|
50
|
+
# before RSI::Loggable#logger() is called the first time).
|
51
|
+
# You can also supply an arbitrary IO to #root_fh=() .
|
52
|
+
#
|
53
|
+
# By default, the LogManager will create logs with level set
|
54
|
+
# to Logger::INFO. Individual classes mixing in RSI::Loggable may
|
55
|
+
# choose to override this by calling #logger.level=() .
|
56
|
+
#
|
57
|
+
class LogManager
|
58
|
+
include Singleton
|
59
|
+
attr_reader :root_logger
|
60
|
+
attr_accessor :root, :log_filename, :root_fh
|
61
|
+
|
62
|
+
def initialize()
|
63
|
+
@root = Dir::tmpdir
|
64
|
+
@log_filename = "app.log"
|
65
|
+
@root_fh = nil
|
66
|
+
@logger_cache = {}
|
67
|
+
@root_logger = nil
|
68
|
+
end
|
69
|
+
|
70
|
+
# Gets the logger for a class.
|
71
|
+
# Can be passed an object, a Class, or a String.
|
72
|
+
public
|
73
|
+
def logger_for( obj="root" )
|
74
|
+
if obj.kind_of?( String )
|
75
|
+
n = obj
|
76
|
+
elsif obj.kind_of?( Module )
|
77
|
+
n = obj.name
|
78
|
+
else
|
79
|
+
n = obj.class.name
|
80
|
+
end
|
81
|
+
unless @logger_cache.has_key?( n )
|
82
|
+
configure() if @root_logger.nil?
|
83
|
+
@logger_cache[n] = Logger.new( @root_logger )
|
84
|
+
@logger_cache[n].progname = n
|
85
|
+
@logger_cache[n].level = Logger::INFO
|
86
|
+
end
|
87
|
+
return @logger_cache[n]
|
88
|
+
end
|
89
|
+
|
90
|
+
private
|
91
|
+
def configure()
|
92
|
+
if @root_fh.nil?
|
93
|
+
@root_fh = File.open( File.join(@root, @log_filename),
|
94
|
+
File::WRONLY | File::APPEND | File::CREAT )
|
95
|
+
@root_fh.sync = true
|
96
|
+
end
|
97
|
+
@root_logger = XLogger.new( @root_fh )
|
98
|
+
@root_logger.progname = "root"
|
99
|
+
#setting @root_logger.level seems to screw things up
|
100
|
+
end
|
101
|
+
|
102
|
+
end
|
103
|
+
end
|
104
|
+
|
105
|
+
|
data/lib/rsi/porter.rb
ADDED
@@ -0,0 +1,213 @@
|
|
1
|
+
#! /local/ruby/bin/ruby
|
2
|
+
#
|
3
|
+
# $Id: porter.rb 37 2005-01-13 04:23:07Z gdf $
|
4
|
+
#
|
5
|
+
# See example usage at the end of this file.
|
6
|
+
#
|
7
|
+
|
8
|
+
module Stemmable
|
9
|
+
|
10
|
+
STEP_2_LIST = {
|
11
|
+
'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',
|
12
|
+
'izer'=>'ize', 'bli'=>'ble',
|
13
|
+
'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous',
|
14
|
+
'ization'=>'ize', 'ation'=>'ate',
|
15
|
+
'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful',
|
16
|
+
'ousness'=>'ous', 'aliti'=>'al',
|
17
|
+
'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'
|
18
|
+
}
|
19
|
+
|
20
|
+
STEP_3_LIST = {
|
21
|
+
'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
|
22
|
+
'ical'=>'ic', 'ful'=>'', 'ness'=>''
|
23
|
+
}
|
24
|
+
|
25
|
+
|
26
|
+
SUFFIX_1_REGEXP = /(
|
27
|
+
ational |
|
28
|
+
tional |
|
29
|
+
enci |
|
30
|
+
anci |
|
31
|
+
izer |
|
32
|
+
bli |
|
33
|
+
alli |
|
34
|
+
entli |
|
35
|
+
eli |
|
36
|
+
ousli |
|
37
|
+
ization |
|
38
|
+
ation |
|
39
|
+
ator |
|
40
|
+
alism |
|
41
|
+
iveness |
|
42
|
+
fulness |
|
43
|
+
ousness |
|
44
|
+
aliti |
|
45
|
+
iviti |
|
46
|
+
biliti |
|
47
|
+
logi)$/x
|
48
|
+
|
49
|
+
|
50
|
+
SUFFIX_2_REGEXP = /(
|
51
|
+
al |
|
52
|
+
ance |
|
53
|
+
ence |
|
54
|
+
er |
|
55
|
+
ic |
|
56
|
+
able |
|
57
|
+
ible |
|
58
|
+
ant |
|
59
|
+
ement |
|
60
|
+
ment |
|
61
|
+
ent |
|
62
|
+
ou |
|
63
|
+
ism |
|
64
|
+
ate |
|
65
|
+
iti |
|
66
|
+
ous |
|
67
|
+
ive |
|
68
|
+
ize)$/x
|
69
|
+
|
70
|
+
|
71
|
+
C = "[^aeiou]" # consonant
|
72
|
+
V = "[aeiouy]" # vowel
|
73
|
+
CC = "#{C}(?>[^aeiouy]*)" # consonant sequence
|
74
|
+
VV = "#{V}(?>[aeiou]*)" # vowel sequence
|
75
|
+
|
76
|
+
MGR0 = /^(#{CC})?#{VV}#{CC}/o # [cc]vvcc... is m>0
|
77
|
+
MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o # [cc]vvcc[vv] is m=1
|
78
|
+
MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o # [cc]vvccvvcc... is m>1
|
79
|
+
VOWEL_IN_STEM = /^(#{CC})?#{V}/o # vowel in stem
|
80
|
+
|
81
|
+
#
|
82
|
+
# Porter stemmer in Ruby.
|
83
|
+
#
|
84
|
+
# This is the Porter stemming algorithm, ported to Ruby from the
|
85
|
+
# version coded up in Perl. It's easy to follow against the rules
|
86
|
+
# in the original paper in:
|
87
|
+
#
|
88
|
+
# Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
|
89
|
+
# no. 3, pp 130-137,
|
90
|
+
#
|
91
|
+
# See also http://www.tartarus.org/~martin/PorterStemmer
|
92
|
+
#
|
93
|
+
# Send comments to raypereda@hotmail.com
|
94
|
+
#
|
95
|
+
|
96
|
+
def stem_porter
|
97
|
+
|
98
|
+
# make a copy of the given object and convert it to a string.
|
99
|
+
w = self.dup.to_str
|
100
|
+
|
101
|
+
return w if w.length < 3
|
102
|
+
|
103
|
+
# now map initial y to Y so that the patterns never treat it as vowel
|
104
|
+
w[0] = 'Y' if w[0] == ?y
|
105
|
+
|
106
|
+
# Step 1a
|
107
|
+
if w =~ /(ss|i)es$/
|
108
|
+
w = $` + $1
|
109
|
+
elsif w =~ /([^s])s$/
|
110
|
+
w = $` + $1
|
111
|
+
end
|
112
|
+
|
113
|
+
# Step 1b
|
114
|
+
if w =~ /eed$/
|
115
|
+
w.chop! if $` =~ MGR0
|
116
|
+
elsif w =~ /(ed|ing)$/
|
117
|
+
stem = $`
|
118
|
+
if stem =~ VOWEL_IN_STEM
|
119
|
+
w = stem
|
120
|
+
case w
|
121
|
+
when /(at|bl|iz)$/ then w << "e"
|
122
|
+
when /([^aeiouylsz])\1$/ then w.chop!
|
123
|
+
when /^#{CC}#{V}[^aeiouwxy]$/o then w << "e"
|
124
|
+
end
|
125
|
+
end
|
126
|
+
end
|
127
|
+
|
128
|
+
if w =~ /y$/
|
129
|
+
stem = $`
|
130
|
+
w = stem + "i" if stem =~ VOWEL_IN_STEM
|
131
|
+
end
|
132
|
+
|
133
|
+
# Step 2
|
134
|
+
if w =~ SUFFIX_1_REGEXP
|
135
|
+
stem = $`
|
136
|
+
suffix = $1
|
137
|
+
# print "stem= " + stem + "\n" + "suffix=" + suffix + "\n"
|
138
|
+
if stem =~ MGR0
|
139
|
+
w = stem + STEP_2_LIST[suffix]
|
140
|
+
end
|
141
|
+
end
|
142
|
+
|
143
|
+
# Step 3
|
144
|
+
if w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/
|
145
|
+
stem = $`
|
146
|
+
suffix = $1
|
147
|
+
if stem =~ MGR0
|
148
|
+
w = stem + STEP_3_LIST[suffix]
|
149
|
+
end
|
150
|
+
end
|
151
|
+
|
152
|
+
# Step 4
|
153
|
+
if w =~ SUFFIX_2_REGEXP
|
154
|
+
stem = $`
|
155
|
+
if stem =~ MGR1
|
156
|
+
w = stem
|
157
|
+
end
|
158
|
+
elsif w =~ /(s|t)(ion)$/
|
159
|
+
stem = $` + $1
|
160
|
+
if stem =~ MGR1
|
161
|
+
w = stem
|
162
|
+
end
|
163
|
+
end
|
164
|
+
|
165
|
+
# Step 5
|
166
|
+
if w =~ /e$/
|
167
|
+
stem = $`
|
168
|
+
if (stem =~ MGR1) ||
|
169
|
+
(stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)
|
170
|
+
w = stem
|
171
|
+
end
|
172
|
+
end
|
173
|
+
|
174
|
+
if w =~ /ll$/ && w =~ MGR1
|
175
|
+
w.chop!
|
176
|
+
end
|
177
|
+
|
178
|
+
# and turn initial Y back to y
|
179
|
+
w[0] = 'y' if w[0] == ?Y
|
180
|
+
|
181
|
+
w
|
182
|
+
end
|
183
|
+
|
184
|
+
|
185
|
+
#
|
186
|
+
# make the stem_porter the default stem method, just in case we
|
187
|
+
# feel like having multiple stemmers available later.
|
188
|
+
#
|
189
|
+
alias stem stem_porter
|
190
|
+
|
191
|
+
end
|
192
|
+
|
193
|
+
|
194
|
+
|
195
|
+
#
|
196
|
+
# Make this script executable, and send it words on stdin, one per
|
197
|
+
# line, and it will output the stemmed versions to stdout.
|
198
|
+
#
|
199
|
+
if $0 == __FILE__ then
  # Mix the stemmer into String so every line read below responds to #stem.
  class String
    include Stemmable
  end

  # the String class, and any subclasses of it you might have, now know
  # how to stem things.

  $stdin.each do |word|
    # Lines arrive with their terminator attached. Strip it before stemming:
    # otherwise a two-letter word such as "is\n" has length 3, slips past the
    # short-word guard in #stem_porter and is wrongly reduced to "i".
    puts word.chomp.stem
  end
end
|
211
|
+
|
212
|
+
|
213
|
+
|
data/lib/rsi/query.rb
ADDED
@@ -0,0 +1,98 @@
|
|
1
|
+
|
2
|
+
require 'rsi/logmanager'
|
3
|
+
|
4
|
+
module RSI
|
5
|
+
|
6
|
+
class Query
|
7
|
+
include Loggable
|
8
|
+
|
9
|
+
def initialize()
|
10
|
+
@subqueries = []
|
11
|
+
end
|
12
|
+
|
13
|
+
def add_subquery( query )
|
14
|
+
@subqueries << query
|
15
|
+
end
|
16
|
+
|
17
|
+
def evaluate( locator ); end
|
18
|
+
|
19
|
+
end
|
20
|
+
|
21
|
+
class ANDQuery < Query
|
22
|
+
# Evaluates every subquery against +locator+ and returns the intersection
# of their results (the docids matched by all subqueries).
#
# Stops evaluating further subqueries as soon as the running intersection
# is empty, since nothing can match after that.
#
# Returns an empty Array when there are no subqueries, so callers always
# get a collection back (consistent with ORQuery and TermQuery).
def evaluate( locator )
  matches = nil
  @subqueries.each do |q|
    docids = q.evaluate( locator )
    # The first subquery seeds the result; later ones narrow it.
    matches = matches.nil? ? docids : matches & docids
    # short-circuit bottoming out
    return matches if matches.empty?
  end
  matches || []
end
|
38
|
+
|
39
|
+
def to_s
|
40
|
+
return "( " + @subqueries.join(" AND ") + " )";
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
class ORQuery < Query
|
45
|
+
# Evaluates every subquery against +locator+ and returns the union of
# their results (each docid appears once).
#
# Takes +locator+ like the other Query#evaluate implementations; the body
# passes it on to each subquery.
#
# Returns an empty Array when there are no subqueries.
def evaluate( locator )
  @subqueries.inject( [] ) do |union, q|
    union | q.evaluate( locator )
  end
end
|
52
|
+
|
53
|
+
def to_s
|
54
|
+
return "( " + @subqueries.join(" OR ") + " )";
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
# Leaf query: matches the documents whose +field+ contains +term+.
class TermQuery < Query
  attr_accessor :field, :term

  # field:: name of the field to search; passed to the locator to pick a dictionary
  # term::  the term to look up in that dictionary
  def initialize( field, term )
    # Run Query#initialize so the inherited @subqueries list exists; without
    # it the inherited #add_subquery would fail on nil.
    super()
    @field = field
    @term = term
  end

  # Looks up the dictionary for the field via +locator+ and returns the
  # docids of all entries for the term, without duplicates.
  # Returns an empty Array when the dictionary does not know the term.
  def evaluate( locator )
    logger.debug( "Getting dict for #@field" )
    dict = locator.get_dict_for_field( @field )

    unless dict.has_term?( term )
      logger.debug( "Dict has no such term #{term}" )
      return []
    end

    termid = dict.get_termid_for( term )
    logger.debug( "Getting entries for #{term}(#{termid})" )
    ret = []
    dict.get_entry_list( termid ).each do |termentry|
      logger.debug( termentry.to_s )
      ret << termentry.docid
    end
    # A term can have several entries for the same document.
    ret.uniq
  end

  # Renders the query as field='term'.
  def to_s
    "#@field='#@term'"
  end
end
|
87
|
+
|
88
|
+
##; def analyze_query( q_str )
|
89
|
+
##; # (a OR b) AND (c OR d)
|
90
|
+
##; # -> AND[ OR[a,b], OR[c,d] ]
|
91
|
+
##; # split on whitespace
|
92
|
+
##; # split x:foo
|
93
|
+
##; # tokenize foo
|
94
|
+
##; # add another AND termquery
|
95
|
+
##;
|
96
|
+
##; end
|
97
|
+
|
98
|
+
end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
#
|
2
|
+
# = RSI (Ruby Simple Indexer)
|
3
|
+
#
|
4
|
+
# RSI is a simple full text search engine implementation in Ruby. It
|
5
|
+
# aims to be easily useful within other programs: simple to set up,
|
6
|
+
# simple to use.
|
7
|
+
#
|
8
|
+
# An emphasis has been placed on getting functionality out the door,
|
9
|
+
# rather than heavy optimization (that can come later). It still
|
10
|
+
# appears to be reasonably fast and efficient (while admitting to have
|
11
|
+
# not been heavily profiled...).
|
12
|
+
#
|
13
|
+
# == Getting RSI
|
14
|
+
#
|
15
|
+
# RSI can be downloaded from Rubyforge (http://rubyforge.org/projects/rsi/).
|
16
|
+
#
|
17
|
+
# == Using RSI
|
18
|
+
#
|
19
|
+
# Creating an index:
|
20
|
+
#
|
21
|
+
# require 'rsi'
|
22
|
+
# indexer = RSI::Index.new( "/path/to/index" )
|
23
|
+
# Dir.foreach( "~/words" ) do |textfile|
|
24
|
+
# indexer.add_document( textfile, File.read("~/words/#{textfile}") )
|
25
|
+
# end
|
26
|
+
# indexer.flush()
|
27
|
+
#
|
28
|
+
# By default, the RSI indexer assumes that documents fed to it are plain
|
29
|
+
# text docs (more complex analyzers should appear in future releases).
|
30
|
+
#
|
31
|
+
# Searching an index:
|
32
|
+
#
|
33
|
+
# require 'rsi'
|
34
|
+
# indexer = RSI::Index.new( "/path/to/index" )
|
35
|
+
# puts indexer.find_all( "some three terms" )
|
36
|
+
#
|
37
|
+
# == Advanced Usage
|
38
|
+
#
|
39
|
+
# (Tweakability will be enhanced in future releases.)
|
40
|
+
#
|
41
|
+
# require 'rsi'
|
42
|
+
#
|
43
|
+
# indexer = RSI::Index.new( "/data/search" )
|
44
|
+
# indexer.serializer = RSI::NativeSerializer.new()
|
45
|
+
# indexer.analyzer = RSI::DefaultTextAnalyzer.new()
|
46
|
+
# indexer.query_analyzer = RSI::DefaultTextAnalyzer.new()
|
47
|
+
#
|
48
|
+
# === Changing the dictionary serializer
|
49
|
+
#
|
50
|
+
# The dictionary's serializer controls how the index database is
|
51
|
+
# stored. By default, RSI uses Ruby's Marshal to store the database
|
52
|
+
# objects. These serializers are also available:
|
53
|
+
#
|
54
|
+
# * RSI::NativeSerializer - default, uses Ruby's built-in Marshal lib.
|
55
|
+
#
|
56
|
+
# * RSI::YAMLSerializer - serializes DB objects as YAML. Excellent for
|
57
|
+
# debugging purposes. Very slow compared to NativeSerializer.
|
58
|
+
#
|
59
|
+
# * RSI::CompressedSerializer - uses Marshal (by default), plus
|
60
|
+
# compresses the output with bzip. The speed penalty is probably not
|
61
|
+
# worth the space savings (at least the way the db is currently
|
62
|
+
# implemented). Also requires the `bz2` library.
|
63
|
+
#
|
64
|
+
# Naturally, if you create an index with a given serializer, you will
|
65
|
+
# need to re-open the index with that same serializer. (This should be
|
66
|
+
# auto-detected in future releases.)
|
67
|
+
#
|
68
|
+
# === Changing the analyzer
|
69
|
+
#
|
70
|
+
# The analyzer is used to tokenize documents into indexable
|
71
|
+
# terms. The default analyzer splits on whitespace and performs some
|
72
|
+
# normalization (stemming, stopword removal, etc).
|
73
|
+
#
|
74
|
+
# The query analyzer is used to tokenize query terms.
|
75
|
+
#
|
76
|
+
# Currently there are no other analyzers available (see Roadmap).
|
77
|
+
#
|
78
|
+
# === Changing the stoplist
|
79
|
+
#
|
80
|
+
# The default stoplist is pretty minimal (see stoplist.rb).
|
81
|
+
#
|
82
|
+
# (should be easier: see Development Roadmap)
|
83
|
+
#
|
84
|
+
# class MyAnalyzer < RSI::Analyzer
|
85
|
+
# def initialize_stoplist()
|
86
|
+
# return unless @stoplist.nil?
|
87
|
+
# @stoplist = { 'THE' => 1, ... }
|
88
|
+
# end
|
89
|
+
# end
|
90
|
+
#
|
91
|
+
module RSI; end
|