rsi 0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,105 @@
1
+
2
+ require 'tmpdir'
3
+ require 'logger'
4
+ require 'singleton'
5
+
6
module RSI

  # Mixin providing a RSI::LogManager-managed #logger() method.
  # #logger() returns a Logger object.
  #
  #   class StuffThing
  #     include RSI::Loggable
  #     def do_dealie()
  #       logger.info( "Doing some dealie" )
  #     end
  #   end
  #
  # The settings of the logger returned by #logger() can be modified:
  #
  #   def initialize()
  #     logger.level = Logger::DEBUG  # log all messages
  #     logger.debug( "This will show up in the log, now" )
  #   end
  #
  module Loggable
    # Returns the Logger that RSI::LogManager keeps for the receiver
    # (see LogManager#logger_for for how the logger name is derived).
    def logger
      RSI::LogManager.instance.logger_for(self)
    end
  end

  # Trivial extension of Logger, providing it a #write() method.
  # This allows instances of this logger to be used as the
  # argument to Logger#new().
  #
  #   root = XLogger.new( "foo.log" )
  #   other = Logger.new( root )
  #
  class XLogger < Logger
    # Passes +msg+ straight to this logger's underlying log device.
    # A logger created with a nil device has no @logdev; in that case
    # the message is dropped, the same way Logger itself discards
    # messages when it has no device.
    def write(msg)
      @logdev.write(msg) unless @logdev.nil?
    end
  end

  # Manages logger creation for classes which mixin RSI::Loggable.
  # LogManager has default settings for the log directory (Dir::tmpdir)
  # and for the log file name ("app.log").
  #
  # If you'd like to override the defaults, call #root=()
  # and/or #log_filename=() before LogManager is first used (ie,
  # before RSI::Loggable#logger() is called the first time).
  # You can also supply an arbitrary IO to #root_fh=() .
  #
  # By default, the LogManager will create logs with level set
  # to Logger::INFO.  Individual classes mixing in RSI::Loggable may
  # choose to override this by calling #logger.level=() .
  #
  class LogManager
    include Singleton
    attr_reader :root_logger
    attr_accessor :root, :log_filename, :root_fh

    def initialize
      @root = Dir::tmpdir
      @log_filename = "app.log"
      @root_fh = nil
      @logger_cache = {}
      @root_logger = nil
    end

    # Gets the logger for a class.
    # Can be passed an object, a Class (or Module), or a String.
    #
    # Loggers are created on first request and cached by name, so
    # repeated calls with the same name return the same Logger object.
    # The root logger is configured lazily, on the first call.
    def logger_for(obj = "root")
      name = logger_name_for(obj)
      unless @logger_cache.key?(name)
        configure if @root_logger.nil?
        child = Logger.new(@root_logger)
        child.progname = name
        child.level = Logger::INFO
        @logger_cache[name] = child
      end
      @logger_cache[name]
    end

    private

    # Derives the cache key / progname for +obj+: a String names itself,
    # a Module is named by its name, anything else by its class's name.
    # Anonymous classes and modules have a nil #name, so fall back to
    # #to_s; otherwise they would all share one logger cached under nil.
    def logger_name_for(obj)
      case obj
      when String then obj
      when Module then obj.name || obj.to_s
      else obj.class.name || obj.class.to_s
      end
    end

    # Builds the root logger, opening the log file under #root in
    # append mode unless an IO was already supplied through #root_fh=.
    def configure
      if @root_fh.nil?
        @root_fh = File.open(File.join(@root, @log_filename),
                             File::WRONLY | File::APPEND | File::CREAT)
        @root_fh.sync = true
      end
      @root_logger = XLogger.new(@root_fh)
      @root_logger.progname = "root"
      #setting @root_logger.level seems to screw things up
    end

  end
end
104
+
105
+
@@ -0,0 +1,213 @@
1
+ #! /local/ruby/bin/ruby
2
+ #
3
+ # $Id: porter.rb 37 2005-01-13 04:23:07Z gdf $
4
+ #
5
+ # See example usage at the end of this file.
6
+ #
7
+
8
module Stemmable

  # Step 2 suffix => replacement table.
  STEP_2_LIST = {
    'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',
    'izer'=>'ize', 'bli'=>'ble',
    'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous',
    'ization'=>'ize', 'ation'=>'ate',
    'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful',
    'ousness'=>'ous', 'aliti'=>'al',
    'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'
  }.freeze

  # Step 3 suffix => replacement table.
  STEP_3_LIST = {
    'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
    'ical'=>'ic', 'ful'=>'', 'ness'=>''
  }.freeze


  # Suffixes handled by step 2 (the keys of STEP_2_LIST).
  SUFFIX_1_REGEXP = /(
                    ational  |
                    tional   |
                    enci     |
                    anci     |
                    izer     |
                    bli      |
                    alli     |
                    entli    |
                    eli      |
                    ousli    |
                    ization  |
                    ation    |
                    ator     |
                    alism    |
                    iveness  |
                    fulness  |
                    ousness  |
                    aliti    |
                    iviti    |
                    biliti   |
                    logi)$/x


  # Suffixes stripped by step 4.
  SUFFIX_2_REGEXP = /(
                      al       |
                      ance     |
                      ence     |
                      er       |
                      ic       |
                      able     |
                      ible     |
                      ant      |
                      ement    |
                      ment     |
                      ent      |
                      ou       |
                      ism      |
                      ate      |
                      iti      |
                      ous      |
                      ive      |
                      ize)$/x


  C = "[^aeiou]".freeze              # consonant
  V = "[aeiouy]".freeze              # vowel
  CC = "#{C}(?>[^aeiouy]*)".freeze   # consonant sequence
  VV = "#{V}(?>[aeiou]*)".freeze     # vowel sequence

  MGR0 = /^(#{CC})?#{VV}#{CC}/                # [cc]vvcc... is m>0
  MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/       # [cc]vvcc[vv] is m=1
  MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/      # [cc]vvccvvcc... is m>1
  VOWEL_IN_STEM   = /^(#{CC})?#{V}/           # vowel in stem
  CVC_STEM = /^#{CC}#{V}[^aeiouwxy]$/         # cc-v-c stem, last c not w, x or y

  #
  # Porter stemmer in Ruby.
  #
  # This is the Porter stemming algorithm, ported to Ruby from the
  # version coded up in Perl.  It's easy to follow against the rules
  # in the original paper in:
  #
  #   Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
  #   no. 3, pp 130-137,
  #
  # See also http://www.tartarus.org/~martin/PorterStemmer
  #
  # Send comments to raypereda@hotmail.com
  #
  # Returns a new String holding the stem of the receiver's string form
  # (#to_str).  The receiver is never modified.  Words shorter than three
  # characters are returned unchanged.  The patterns only know lower-case
  # letters, and expect a single word with no trailing newline.
  #

  def stem_porter

    # Work on a private copy of the string form.  Duplicating the string
    # (rather than the receiver) matters for non-String receivers whose
    # #to_str hands out internal state: the in-place edits below must
    # never reach the caller's data.
    w = to_str.dup

    return w if w.length < 3

    # now map initial y to Y so that the patterns never treat it as vowel
    w[0] = 'Y' if w.start_with?('y')

    # Step 1a
    if (m = /(ss|i)es$/.match(w))
      w = m.pre_match + m[1]
    elsif (m = /([^s])s$/.match(w))
      w = m.pre_match + m[1]
    end

    # Step 1b
    if (m = /eed$/.match(w))
      w.chop! if m.pre_match =~ MGR0
    elsif (m = /(ed|ing)$/.match(w))
      stem = m.pre_match
      if stem =~ VOWEL_IN_STEM
        w = stem
        case w
        when /(at|bl|iz)$/             then w << "e"
        when /([^aeiouylsz])\1$/       then w.chop!
        when CVC_STEM                  then w << "e"
        end
      end
    end

    # Step 1c (terminal y -> i when the stem contains a vowel)
    if (m = /y$/.match(w))
      stem = m.pre_match
      w = stem + "i" if stem =~ VOWEL_IN_STEM
    end

    # Step 2
    if (m = SUFFIX_1_REGEXP.match(w))
      stem = m.pre_match
      suffix = m[1]
      w = stem + STEP_2_LIST[suffix] if stem =~ MGR0
    end

    # Step 3
    if (m = /(icate|ative|alize|iciti|ical|ful|ness)$/.match(w))
      stem = m.pre_match
      suffix = m[1]
      w = stem + STEP_3_LIST[suffix] if stem =~ MGR0
    end

    # Step 4
    if (m = SUFFIX_2_REGEXP.match(w))
      stem = m.pre_match
      w = stem if stem =~ MGR1
    elsif (m = /(s|t)(ion)$/.match(w))
      # "ion" is only dropped after s or t, which stays part of the stem
      stem = m.pre_match + m[1]
      w = stem if stem =~ MGR1
    end

    #  Step 5
    if (m = /e$/.match(w))
      stem = m.pre_match
      if (stem =~ MGR1) ||
          (stem =~ MEQ1 && stem !~ CVC_STEM)
        w = stem
      end
    end

    if w =~ /ll$/ && w =~ MGR1
      w.chop!
    end

    # and turn initial Y back to y
    w[0] = 'y' if w.start_with?('Y')

    w
  end


  #
  # make the stem_porter the default stem method, just in case we
  # feel like having multiple stemmers available later.
  #
  alias stem stem_porter

end
192
+
193
+
194
+
195
+ #
196
+ # Make this script executable, and send it words on stdin, one per
197
+ # line, and it will output the stemmed versions to stdout.
198
+ #
199
if $0 == __FILE__
  class String
    include Stemmable
  end

  # the String class, and any subclasses of it you might have, now know
  # how to stem things.

  # Each line read from stdin still carries its line terminator.  Strip it
  # before stemming: the stemmer's end-of-word edits (e.g. chop!) would
  # otherwise remove the newline instead of the final letter, so a line
  # like "agreed" would be printed unstemmed.
  $stdin.each do |word|
    puts word.chomp.stem
  end
end
211
+
212
+
213
+
@@ -0,0 +1,98 @@
1
+
2
+ require 'rsi/logmanager'
3
+
4
+ module RSI
5
+
6
class Query
  include Loggable

  # Starts with an empty list of subqueries.
  def initialize
    @subqueries = []
  end

  # Appends +query+ to this query's list of subqueries.
  def add_subquery(query)
    @subqueries.push(query)
  end

  # Base implementation: does nothing and returns nil.
  # Subclasses override this to evaluate against +locator+.
  def evaluate(locator)
    nil
  end

end
20
+
21
# Query whose result is the intersection of its subqueries' results.
class ANDQuery < Query
  # Evaluates every subquery against +locator+ and intersects the
  # results with &.  Stops as soon as the running intersection is
  # empty, since no later subquery can add anything back.
  #
  # Returns an empty Array when there are no subqueries (rather than
  # nil), matching what ORQuery returns in the same situation.
  def evaluate(locator)
    ret_set = nil
    @subqueries.each do |q|
      set = q.evaluate(locator)
      ret_set = ret_set.nil? ? set : ret_set & set
      # short-circuit bottoming out
      return ret_set if ret_set.empty?
    end
    ret_set || []
  end

  # Renders the subqueries joined by " AND ", wrapped in parentheses.
  def to_s
    "( " + @subqueries.join(" AND ") + " )"
  end
end
43
+
44
# Query whose result is the union of its subqueries' results.
class ORQuery < Query
  # Evaluates every subquery against +locator+ and returns the union
  # (Array#|) of their results; an empty Array when there are no
  # subqueries.
  #
  # +locator+ is the same object the sibling query classes receive; it
  # is only passed through to the subqueries.  (The method previously
  # declared no parameter while using +locator+ in its body, so it
  # could not be called successfully.)
  def evaluate(locator)
    @subqueries.inject([]) do |ret_set, q|
      ret_set | q.evaluate(locator)
    end
  end

  # Renders the subqueries joined by " OR ", wrapped in parentheses.
  def to_s
    "( " + @subqueries.join(" OR ") + " )"
  end
end
57
+
58
# Leaf query matching a single term within a single field.
class TermQuery < Query
  attr_accessor :field, :term

  # +field+ and +term+ are stored as given and used by #evaluate.
  def initialize(field, term)
    # Let Query set up @subqueries so the inherited #add_subquery
    # works on a TermQuery instead of failing on nil.
    super()
    @field = field
    @term = term
  end

  # Looks +term+ up in the dictionary that +locator+ provides for
  # +field+ and returns the docids of its entries, without duplicates.
  # Returns an empty Array when the dictionary does not have the term.
  def evaluate(locator)
    logger.debug("Getting dict for #@field")
    dict = locator.get_dict_for_field(@field)
    # get all docids containing @field:@term -> []
    unless dict.has_term?(term)
      logger.debug("Dict has no such term #{term}")
      return []
    end

    ret = []
    termid = dict.get_termid_for(term)
    logger.debug("Getting entries for #{term}(#{termid})")
    dict.get_entry_list(termid).each do |termentry|
      logger.debug(termentry.to_s)
      ret << termentry.docid
    end
    ret.uniq
  end

  # Renders the query as field='term'.
  def to_s
    "#@field='#@term'"
  end
end
87
+
88
+ ##; def analyze_query( q_str )
89
+ ##; # (a OR b) AND (c OR d)
90
+ ##; # -> AND[ OR[a,b], OR[c,d] ]
91
+ ##; # split on whitespace
92
+ ##; # split x:foo
93
+ ##; # tokenize foo
94
+ ##; # add another AND termquery
95
+ ##;
96
+ ##; end
97
+
98
+ end
@@ -0,0 +1,91 @@
1
+ #
2
+ # = RSI (Ruby Simple Indexer)
3
+ #
4
+ # RSI is a simple full text search engine implementation in Ruby. It
5
+ # aims to be easily useful within other programs: simple to set up,
6
+ # simple to use.
7
+ #
8
+ # An emphasis has been placed on getting functionality out the door,
9
+ # rather than heavy optimization (that can come later). It still
10
+ # appears to be reasonably fast and efficient (while admitting to have
11
+ # not been heavily profiled...).
12
+ #
13
+ # == Getting RSI
14
+ #
15
+ # RSI can be downloaded from Rubyforge (http://rubyforge.org/projects/rsi/).
16
+ #
17
+ # == Using RSI
18
+ #
19
+ # Creating an index:
20
+ #
21
+ # require 'rsi'
22
+ # indexer = RSI::Index.new( "/path/to/index" )
23
+ # Dir.foreach( "~/words" ) do |textfile|
24
+ # indexer.add_document( textfile, File.read("~/words/#{textfile}") )
25
+ # end
26
+ # indexer.flush()
27
+ #
28
+ # By default, the RSI indexer assumes that documents fed to it are plain
29
+ # text docs (more complex analyzers should appear in future releases).
30
+ #
31
+ # Searching an index:
32
+ #
33
+ # require 'rsi'
34
+ # indexer = RSI::Index.new( "/path/to/index" )
35
+ # puts indexer.find_all( "some three terms" )
36
+ #
37
+ # == Advanced Usage
38
+ #
39
+ # (Tweakability will be enhanced in future releases.)
40
+ #
41
+ # require 'rsi'
42
+ #
43
+ # indexer = RSI::Indexer.new( "/data/search" )
44
+ # indexer.serializer = RSI::NativeSerializer.new()
45
+ # indexer.analyzer = RSI::DefaultTextAnalyzer.new()
46
+ # indexer.query_analyzer = RSI::DefaultTextAnalyzer.new()
47
+ #
48
+ # === Changing the dictionary serializer
49
+ #
50
+ # The dictionary's serializer controls how the index database is
51
+ # stored. By default, RSI uses Ruby's Marshal to store the database
52
+ # objects. These serializers are also available:
53
+ #
54
+ # * RSI::NativeSerializer - default, uses Ruby's built-in Marshal lib.
55
+ #
56
+ # * RSI::YAMLSerializer - serializes DB objects as YAML. Excellent for
57
+ # debugging purposes. Very slow compared to NativeSerializer.
58
+ #
59
+ # * RSI::CompressedSerializer - uses Marshal (by default), plus
60
+ # compresses the output with bzip. The speed penalty is probably not
61
+ # worth the space savings (at least the way the db is currently
62
+ # implemented). Also requires the `bz2` library.
63
+ #
64
+ # Naturally, if you create an index with a given serializer, you will
65
+ # need to re-open the index with that same serializer. (This should be
66
+ # auto-detected in future releases.)
67
+ #
68
+ # === Changing the analyzer
69
+ #
70
+ # The analyzer is used to tokenize documents into indexable
71
+ # terms. The default analyzer splits on whitespace and performs some
72
+ # normalization (stemming, stopword removal, etc).
73
+ #
74
+ # The query analyzer is used to tokenize query terms.
75
+ #
76
+ # Currently there are no other analyzers available (see Roadmap).
77
+ #
78
+ # === Changing the stoplist
79
+ #
80
+ # The default stoplist is pretty minimal (see stoplist.rb).
81
+ #
82
+ # (should be easier: see Development Roadmap)
83
+ #
84
+ # class MyAnalyzer < RSI::Analyzer
85
+ # def initialize_stoplist()
86
+ # return unless @stoplist.nil?
87
+ # @stoplist = { 'THE' => 1, ... }
88
+ # end
89
+ # end
90
+ #
91
+ module RSI; end