rsi 0.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,105 @@
1
+
2
+ require 'tmpdir'
3
+ require 'logger'
4
+ require 'singleton'
5
+
6
+ module RSI
7
+
8
+ # Mixin providing a RSI::LogManager-managed #logger() method.
9
+ # #logger() returns a Logger object.
10
+ #
11
+ # class StuffThing
12
+ # include RSI::Loggable
13
+ # def do_dealie()
14
+ # logger.info( "Doing some dealie" )
15
+ # end
16
+ # end
17
+ #
18
+ # The settings of the logger returned by #logger() can be modified:
19
+ #
20
+ # def initialize()
21
+ # logger.level = Logger::DEBUG # log all messages
22
+ # logger.debug( "This will show up in the log, now" )
23
+ # end
24
+ #
25
module Loggable
  # Returns the Logger that the RSI::LogManager singleton hands out for
  # the receiver (the manager is asked via #logger_for( self )).
  def logger
    manager = RSI::LogManager.instance
    manager.logger_for( self )
  end
end
30
+
31
+ # Trivial extension of Logger, providing it a #write() method.
32
+ # This allows instances of this logger to be used as the
33
+ # argument to Logger#new().
34
+ #
35
+ # root = XLogger.new( "foo.log" )
36
+ # other = Logger.new( root )
37
+ #
38
class XLogger < Logger
  # Writes +msg+ straight to this logger's underlying log device,
  # bypassing severity filtering and formatting. Having #write (Logger
  # already has #close) is what lets an XLogger be passed to Logger#new()
  # as if it were an IO.
  #
  # Returns whatever the device's #write returns, or nil when this
  # logger was created without a device (e.g. XLogger.new( nil )), in
  # which case the message is discarded instead of raising NoMethodError.
  def write( msg )
    return nil if @logdev.nil?
    @logdev.write( msg )
  end
end
43
+
44
+ # Manages logger creation for classes which mixin RSI::Loggable.
45
+ # LogManager has default settings for the log directory (Dir::tmpdir)
46
+ # and for the log file name ("app.log").
47
+ #
48
+ # If you'd like to override the defaults, call #root=()
49
+ # and/or #log_filename=() before LogManager is first used (ie,
50
+ # before RSI::Loggable#logger() is called the first time).
51
+ # You can also supply an arbitrary IO to #root_fh=() .
52
+ #
53
+ # By default, the LogManager will create logs with level set
54
+ # to Logger::INFO. Individual classes mixing in RSI::Loggable may
55
+ # choose to override this by calling #logger.level=() .
56
+ #
57
class LogManager
  include Singleton

  attr_reader :root_logger
  attr_accessor :root, :log_filename, :root_fh

  # Sets up the defaults: logs go to "app.log" inside Dir::tmpdir, and
  # nothing is opened until the first logger is requested.
  def initialize
    @logger_cache = {}
    @root_logger = nil
    @root_fh = nil
    @root = Dir::tmpdir
    @log_filename = "app.log"
  end

  # Gets the logger for a class.
  # Can be passed an object, a Class (or any Module), or a String; the
  # String itself, the module's name, or the object's class name is used
  # as the logger's progname and as the cache key. Repeated requests for
  # the same name return the same Logger instance. New loggers start at
  # level Logger::INFO.
  def logger_for( obj="root" )
    key = case obj
          when String then obj
          when Module then obj.name
          else obj.class.name
          end
    return @logger_cache[key] if @logger_cache.has_key?( key )

    # The root logger is built lazily, on the first cache miss.
    configure if @root_logger.nil?
    child = Logger.new( @root_logger )
    @logger_cache[key] = child
    child.progname = key
    child.level = Logger::INFO
    child
  end

  private

  # Builds the root logger. Unless a handle was supplied via #root_fh=,
  # the log file <root>/<log_filename> is opened for appending (created
  # if missing) with sync enabled.
  def configure
    if @root_fh.nil?
      path = File.join( @root, @log_filename )
      @root_fh = File.open( path, File::WRONLY | File::APPEND | File::CREAT )
      @root_fh.sync = true
    end
    @root_logger = XLogger.new( @root_fh )
    @root_logger.progname = "root"
    # setting @root_logger.level seems to screw things up
  end
end
103
+ end
104
+
105
+
@@ -0,0 +1,213 @@
1
+ #! /local/ruby/bin/ruby
2
+ #
3
+ # $Id: porter.rb 37 2005-01-13 04:23:07Z gdf $
4
+ #
5
+ # See example usage at the end of this file.
6
+ #
7
+
8
module Stemmable

  # Step 2 rewrites: matched suffix => replacement.
  STEP_2_LIST = {
    'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',
    'izer'=>'ize', 'bli'=>'ble',
    'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous',
    'ization'=>'ize', 'ation'=>'ate',
    'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful',
    'ousness'=>'ous', 'aliti'=>'al',
    'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'
  }

  # Step 3 rewrites: matched suffix => replacement.
  STEP_3_LIST = {
    'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
    'ical'=>'ic', 'ful'=>'', 'ness'=>''
  }

  # Suffixes examined in step 2 (the keys of STEP_2_LIST).
  SUFFIX_1_REGEXP = /(
    ational | tional  | enci    | anci    | izer    | bli     |
    alli    | entli   | eli     | ousli   | ization | ation   |
    ator    | alism   | iveness | fulness | ousness | aliti   |
    iviti   | biliti  | logi)$/x

  # Suffixes stripped outright in step 4.
  SUFFIX_2_REGEXP = /(
    al    | ance | ence | er  | ic  | able |
    ible  | ant  | ement | ment | ent | ou  |
    ism   | ate  | iti  | ous | ive | ize)$/x

  C = "[^aeiou]"            # consonant
  V = "[aeiouy]"            # vowel
  CC = "#{C}(?>[^aeiouy]*)" # consonant sequence
  VV = "#{V}(?>[aeiou]*)"   # vowel sequence

  MGR0 = /^(#{CC})?#{VV}#{CC}/o             # [cc]vvcc... is m>0
  MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o    # [cc]vvcc[vv] is m=1
  MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o   # [cc]vvccvvcc... is m>1
  VOWEL_IN_STEM = /^(#{CC})?#{V}/o          # vowel in stem

  #
  # Porter stemmer in Ruby.
  #
  # This is the Porter stemming algorithm, ported to Ruby from the
  # version coded up in Perl. It's easy to follow against the rules
  # in the original paper in:
  #
  #   Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
  #   no. 3, pp 130-137,
  #
  # See also http://www.tartarus.org/~martin/PorterStemmer
  #
  # Send comments to raypereda@hotmail.com
  #
  # Returns a new String holding the stem; the receiver is not modified.
  # Words shorter than three characters are returned unchanged.
  #
  def stem_porter
    # Work on a copy of the receiver, converted to a String.
    word = self.dup.to_str

    return word if word.length < 3

    # Map an initial y to Y so that the patterns never treat it as a vowel.
    word[0] = 'Y' if word[0] == ?y

    # Step 1a: plural endings.
    plural = /(ss|i)es$/.match( word ) || /([^s])s$/.match( word )
    word = plural.pre_match + plural[1] if plural

    # Step 1b: -eed, -ed, -ing.
    if (m = /eed$/.match( word ))
      word.chop! if m.pre_match =~ MGR0
    elsif (m = /(ed|ing)$/.match( word ))
      base = m.pre_match
      if base =~ VOWEL_IN_STEM
        word = base
        if word =~ /(at|bl|iz)$/
          word << "e"
        elsif word =~ /([^aeiouylsz])\1$/
          word.chop!
        elsif word =~ /^#{CC}#{V}[^aeiouwxy]$/o
          word << "e"
        end
      end
    end

    # Terminal y becomes i when the stem contains a vowel.
    if (m = /y$/.match( word ))
      base = m.pre_match
      word = base + "i" if base =~ VOWEL_IN_STEM
    end

    # Step 2
    if (m = SUFFIX_1_REGEXP.match( word ))
      base = m.pre_match
      word = base + STEP_2_LIST[m[1]] if base =~ MGR0
    end

    # Step 3
    if (m = /(icate|ative|alize|iciti|ical|ful|ness)$/.match( word ))
      base = m.pre_match
      word = base + STEP_3_LIST[m[1]] if base =~ MGR0
    end

    # Step 4: the -ion rule is only tried when no SUFFIX_2 suffix matched,
    # and it keeps the preceding s/t as part of the stem.
    if (m = SUFFIX_2_REGEXP.match( word ))
      base = m.pre_match
      word = base if base =~ MGR1
    elsif (m = /(s|t)(ion)$/.match( word ))
      base = m.pre_match + m[1]
      word = base if base =~ MGR1
    end

    # Step 5: drop a final e, then reduce a final double l.
    if (m = /e$/.match( word ))
      base = m.pre_match
      if (base =~ MGR1) ||
         (base =~ MEQ1 && base !~ /^#{CC}#{V}[^aeiouwxy]$/o)
        word = base
      end
    end

    word.chop! if word =~ /ll$/ && word =~ MGR1

    # and turn initial Y back to y
    word[0] = 'y' if word[0] == ?Y

    word
  end

  #
  # make the stem_porter the default stem method, just in case we
  # feel like having multiple stemmers available later.
  #
  alias stem stem_porter

end
192
+
193
+
194
+
195
+ #
196
+ # Make this script executable, and send it words on stdin, one per
197
+ # line, and it will output the stemmed versions to stdout.
198
+ #
199
if $0 == __FILE__ then
  class String
    include Stemmable
  end

  # the String class, and any subclasses of it you might have, now know
  # how to stem things.

  $stdin.each do |word|
    # Each line still carries its line terminator. Strip it before
    # stemming: the stemmer's patterns use `$`, which matches in front of
    # a trailing newline, so its chop! calls would otherwise remove the
    # newline instead of the intended letter (e.g. "agreed\n" would come
    # out as "agreed" rather than "agre").
    puts word.chomp.stem
  end
end
211
+
212
+
213
+
@@ -0,0 +1,98 @@
1
+
2
+ require 'rsi/logmanager'
3
+
4
+ module RSI
5
+
6
# Base class for query nodes: holds an ordered list of subqueries.
# Subclasses override #evaluate.
class Query
  include Loggable

  # Starts with an empty subquery list.
  def initialize
    @subqueries = []
  end

  # Appends +query+ to the subquery list and returns that list.
  def add_subquery( query )
    @subqueries.push( query )
  end

  # No-op in the base class; always returns nil.
  def evaluate( locator )
    nil
  end

end
20
+
21
# Query node that intersects the results of all of its subqueries.
class ANDQuery < Query
  # Evaluates each subquery against +locator+ in order and combines the
  # results with &. Stops as soon as the running intersection is empty.
  # Returns nil when there are no subqueries.
  def evaluate( locator )
    result = nil
    @subqueries.each do |sub|
      hits = sub.evaluate( locator )
      result = result.nil? ? hits : (result & hits)
      # short-circuit bottoming out
      return result if result.size() == 0
    end
    result
  end

  # Renders the subqueries joined with " AND ", wrapped in parentheses.
  def to_s
    "( #{@subqueries.join(" AND ")} )"
  end
end
43
+
44
# Query node that unions the results of all of its subqueries.
class ORQuery < Query
  # Evaluates each subquery against +locator+ and combines the results
  # with |. Returns an empty Array when there are no subqueries.
  #
  # The +locator+ parameter was missing from the original signature even
  # though the body used it, so every call raised; it now matches
  # Query#evaluate( locator ) and ANDQuery#evaluate( locator ).
  def evaluate( locator )
    ret_set = []
    @subqueries.each do |q|
      ret_set = ret_set | q.evaluate( locator )
    end
    ret_set
  end

  # Renders the subqueries joined with " OR ", wrapped in parentheses.
  def to_s
    "( " + @subqueries.join(" OR ") + " )"
  end
end
57
+
58
# Leaf query node matching a single term within a single field.
class TermQuery < Query
  attr_accessor :field, :term

  # field:: name of the field to search
  # term::  term to look up in that field's dictionary
  def initialize( field, term )
    # Run Query#initialize so @subqueries exists; without it the
    # inherited #add_subquery would fail on a nil list.
    super()
    @field = field
    @term = term
  end

  # Looks up the dictionary for @field via locator.get_dict_for_field and
  # returns the unique docids of the entries listed for the term, or an
  # empty Array when the dictionary does not contain the term.
  def evaluate( locator )
    logger.debug( "Getting dict for #@field" )
    dict = locator.get_dict_for_field( @field )

    unless dict.has_term?( term )
      logger.debug( "Dict has no such term #{term}" )
      return []
    end

    # get all docids containing @field:@term
    ret = []
    termid = dict.get_termid_for( term )
    logger.debug( "Getting entries for #{term}(#{termid})" )
    dict.get_entry_list( termid ).each do |termentry|
      logger.debug( termentry.to_s )
      ret << termentry.docid
    end
    ret.uniq
  end

  # Renders the query as field='term'.
  def to_s
    "#@field='#@term'"
  end
end
87
+
88
+ ##; def analyze_query( q_str )
89
+ ##; # (a OR b) AND (c OR d)
90
+ ##; # -> AND[ OR[a,b], OR[c,d] ]
91
+ ##; # split on whitespace
92
+ ##; # split x:foo
93
+ ##; # tokenize foo
94
+ ##; # add another AND termquery
95
+ ##;
96
+ ##; end
97
+
98
+ end
@@ -0,0 +1,91 @@
1
+ #
2
+ # = RSI (Ruby Simple Indexer)
3
+ #
4
+ # RSI is a simple full text search engine implementation in Ruby. It
5
+ # aims to be easily useful within other programs: simple to set up,
6
+ # simple to use.
7
+ #
8
+ # An emphasis has been placed on getting functionality out the door,
9
+ # rather than heavy optimization (that can come later). It still
10
+ # appears to be reasonably fast and efficient (while admitting to have
11
+ # not been heavily profiled...).
12
+ #
13
+ # == Getting RSI
14
+ #
15
+ # RSI can be downloaded from Rubyforge (http://rubyforge.org/projects/rsi/).
16
+ #
17
+ # == Using RSI
18
+ #
19
+ # Creating an index:
20
+ #
21
+ # require 'rsi'
22
+ # indexer = RSI::Indexer.new( "/path/to/index" )
23
+ # Dir.foreach( "~/words" ) do |textfile|
24
+ # indexer.add_document( textfile, File.read("~/words/#{textfile}") )
25
+ # end
26
+ # indexer.flush()
27
+ #
28
+ # By default, the RSI indexer assumes that documents fed to it are plain
29
+ # text docs (more complex analyzers should appear in future releases).
30
+ #
31
+ # Searching an index:
32
+ #
33
+ # require 'rsi'
34
+ # indexer = RSI::Indexer.new( "/path/to/index" )
35
+ # puts indexer.find_all( "some three terms" )
36
+ #
37
+ # == Advanced Usage
38
+ #
39
+ # (Tweakability will be enhanced in future releases.)
40
+ #
41
+ # require 'rsi'
42
+ #
43
+ # indexer = RSI::Indexer.new( "/data/search" )
44
+ # indexer.serializer = RSI::NativeSerializer.new()
45
+ # indexer.analyzer = RSI::DefaultTextAnalyzer.new()
46
+ # indexer.query_analyzer = RSI::DefaultTextAnalyzer.new()
47
+ #
48
+ # === Changing the dictionary serializer
49
+ #
50
+ # The dictionary's serializer controls how the index database is
51
+ # stored. By default, RSI uses Ruby's Marshal to store the database
52
+ # objects. These serializers are also available:
53
+ #
54
+ # * RSI::NativeSerializer - default, uses Ruby's built-in Marshal lib.
55
+ #
56
+ # * RSI::YAMLSerializer - serializes DB objects as YAML. Excellent for
57
+ # debugging purposes. Very slow compared to NativeSerializer.
58
+ #
59
+ # * RSI::CompressedSerializer - uses Marshal (by default), plus
60
+ # compresses the output with bzip. The speed penalty is probably not
61
+ # worth the space savings (at least the way the db is currently
62
+ # implemented). Also requires the `bz2` library.
63
+ #
64
+ # Naturally, if you create an index with a given serializer, you will
65
+ # need to re-open the index with that same serializer. (This should be
66
+ # auto-detected in future releases.)
67
+ #
68
+ # === Changing the analyzer
69
+ #
70
+ # The analyzer is used to tokenize documents into indexable
71
+ # terms. The default analyzer splits on whitespace and performs some
72
+ # normalization (stemming, stopword removal, etc).
73
+ #
74
+ # The query analyzer is used to tokenize query terms.
75
+ #
76
+ # Currently there are no other analyzers available (see Roadmap).
77
+ #
78
+ # === Changing the stoplist
79
+ #
80
+ # The default stoplist is pretty minimal (see stoplist.rb).
81
+ #
82
+ # (should be easier: see Development Roadmap)
83
+ #
84
+ # class MyAnalyzer < RSI::Analyzer
85
+ # def initialize_stoplist()
86
+ # return unless @stoplist.nil?
87
+ # @stoplist = { 'THE' => 1, ... }
88
+ # end
89
+ # end
90
+ #
91
module RSI
end